In [1]:
from datasets import load_dataset
from transformers import BertTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
import jieba
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split


In [18]:
# 读取数据
data = load_dataset('csv', data_files='weibo_senti_100k.csv', split='train')
data

Dataset({
    features: ['label', 'review'],
    num_rows: 119988
})

In [19]:
data = data.filter(lambda x: x['review'] is not None and x['label'] is not None)
data

Dataset({
    features: ['label', 'review'],
    num_rows: 119988
})

In [20]:
data = data.train_test_split(test_size=0.2)
data

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 95990
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 23998
    })
})

In [23]:
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

def process_function(data):
    data['tokens'] = " ".join(jieba.lcut(data['review']))
    tokenized_data = tokenizer(data['tokens'], max_length=128, truncation=True)
    tokenized_data['labels'] = data['label']
    return tokenized_data



In [10]:
# 分词
def chinese_word_cut(text):
    text['tokens'] = " ".join(jieba.lcut(text['review']))
    return text

In [25]:
tokenized_dataset = data.map(process_function, remove_columns=data['train'].column_names)
tokenized_dataset

Map:   0%|          | 0/95990 [00:00<?, ? examples/s]

Map:   0%|          | 0/23998 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 95990
    })
    test: Dataset({
        features: ['tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 23998
    })
})

In [30]:
# 创建模型和优化器
from transformers import BertForSequenceClassification, AdamW
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)
model.config

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import evaluate

acc_metric = evaluate.load('accuracy')
f1_metric = evaluate.load('f1')

In [None]:
def compute_metrics(eval_predict):
    preds, labels = eval_predict
    preds = preds.argmax(axis=1)
    acc = acc_metric.compute(predictions=preds, references=labels)
    f1 = f1_metric.compute(predictions=preds, references=labels)
    return {
        'accuracy': acc,
        'f1': f1
    }

In [None]:
train_args = TrainingArguments(
    output_dir='./checkpoints',
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=128,
    warmup_steps=500,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=3,
    metric_for_best_model='accuracy',
    load_best_model_at_end=True
)

In [None]:
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics
)

In [None]:
# 使用peft库的lora进行微调
trainer.train()

In [None]:
trainer.train()

In [None]:
trainer.evaluate(tokenized_dataset['test'])