In [1]:
import numpy as np
from transformers import AutoTokenizer, DataCollatorWithPadding
import datasets

# Name of the pretrained checkpoint; reused below when loading the model.
checkpoint = 'bert-base-chinese'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# CSV files for the two splits; the keys become the split names of the DatasetDict.
data_files = dict(
    train="./data/sougou/train.csv",
    test="./data/sougou/test.csv",
)
raw_datasets = datasets.load_dataset(
    "csv",
    data_files=data_files,
    delimiter=",",
)

Found cached dataset csv (/home/jclian/.cache/huggingface/datasets/csv/default-25ea387b8c9da915/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Show the DatasetDict summary (splits, feature names, row counts).
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 495
    })
})

In [11]:
def tokenize_function(sample):
    """Tokenize the ``text`` field of ``sample`` with the module-level tokenizer.

    Inputs are truncated to at most 128 tokens. No padding is requested here.

    Args:
        sample: Mapping with a ``'text'`` entry (a batch of examples when
            called through ``map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for the given text.
    """
    texts = sample['text']
    return tokenizer(texts, truncation=True, max_length=128)
# Tokenize every split; batched=True hands tokenize_function groups of examples per call.
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

# Padding collator built from the same tokenizer; handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer)

Loading cached processed dataset at /home/jclian/.cache/huggingface/datasets/csv/default-25ea387b8c9da915/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-b5ee1d8f408cba64.arrow
Loading cached processed dataset at /home/jclian/.cache/huggingface/datasets/csv/default-25ea387b8c9da915/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-608905025d40a966.arrow


In [12]:
from transformers import AutoModelForSequenceClassification, set_seed

# Seed Python/NumPy/PyTorch RNGs *before* building the model. The classification
# head (classifier.weight / classifier.bias) is not in the checkpoint and is
# freshly initialised, so without a seed here its initial weights -- and therefore
# the reported metrics -- change on every Restart & Run All. Trainer only seeds
# inside its constructor, which runs after the model already exists.
# Keep this value in sync with TrainingArguments(seed=...) if that is ever set.
set_seed(42)

# num_labels=5: presumably the number of distinct label values in the CSVs --
# verify against the data if the dataset changes.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (gold labels) and ``predictions``
            (assumed to be an array of per-class scores; the predicted class is
            the argmax over the last axis).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    acc = accuracy_score(y_true, y_pred)
    # The fourth element (support) is not reported.
    scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    precision, recall, f1 = scores[:3]

    return dict(accuracy=acc, f1=f1, precision=precision, recall=recall)

training_args = TrainingArguments(
    # Output folder; it is created automatically if it does not exist.
    output_dir='sougou_test_trainer_128',
    # Evaluate, log and checkpoint once per epoch.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # Batch sizes are per device, so the effective batch scales with device count.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Optimisation schedule.
    learning_rate=5e-5,
    num_train_epochs=3,
    warmup_ratio=0.2,
    # Logging destination and backend.
    logging_dir='./sougou_train_logs',
    report_to="tensorboard",
)

# Wire model, arguments, data and metrics together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # Once a tokenizer is supplied, data_collator does not actually need to be
    # passed here; one is created automatically from the tokenizer.
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [14]:
# Run fine-tuning; the per-epoch loss/metric table and the final TrainOutput are displayed as the cell output.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.8492,0.115189,0.969697,0.969449,0.970073,0.969697
2,0.1069,0.093987,0.973737,0.97377,0.975372,0.973737
3,0.0478,0.078861,0.973737,0.97374,0.974117,0.973737




TrainOutput(global_step=96, training_loss=0.3346322464446227, metrics={'train_runtime': 98.4099, 'train_samples_per_second': 121.939, 'train_steps_per_second': 0.976, 'total_flos': 789354427392000.0, 'train_loss': 0.3346322464446227, 'epoch': 3.0})