# Hugging Face Transformers 微调训练入门

## 1. 下载数据集

In [1]:
from datasets import load_dataset

# Fetch the "yelp_review_full" dataset by name.
# Later cells tokenize it and read its "train" and "test" splits.
dataset = load_dataset("yelp_review_full")

## 2. 预处理数据

In [2]:
from transformers import AutoTokenizer

# Load the tokenizer from the same checkpoint name that the model cell uses.
# NOTE(review): "bert_base_cased" (underscores) is not the Hub id "bert-base-cased";
# presumably it resolves to a local directory holding the checkpoint — confirm.
tokenizer = AutoTokenizer.from_pretrained("bert_base_cased")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the notebook-level tokenizer.

    Args:
        examples: Mapping with a "text" entry (presumably a list of strings,
            because the function is applied with ``batched=True`` — confirm).

    Returns:
        The tokenizer's output for those texts, padded to "max_length" and
        truncated.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded


# Run tokenize_function over the dataset; batched=True passes a batch of examples per call.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

## 3. 加载 BERT 模型

In [3]:
from transformers import AutoModelForSequenceClassification, set_seed

# The checkpoint ships without a classification head, so `classifier.weight` and
# `classifier.bias` are randomly initialized when the model is loaded. Seed the
# random number generators first so that this initialization — and therefore the
# untrained baseline reported by `trainer.evaluate()` — is reproducible across runs.
SEED = 42  # same value as the default `seed` of TrainingArguments
set_seed(SEED)

# num_labels=5 sizes the classification head for five output classes.
model = AutoModelForSequenceClassification.from_pretrained("bert_base_cased", num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert_base_cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 4. 设置超参数和评估指标

In [4]:
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

# Directory used both as the Trainer output_dir (checkpoints) and as the target of save_model().
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
model_dir = "/root/autodl-tmp/yelp_bert"
# Accuracy metric loaded through `evaluate`; compute_metrics calls its compute() method.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score predictions with the notebook-level accuracy metric.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        logits (taken over the last axis) against the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

# Training configuration: a single epoch, with evaluation and checkpointing
# triggered on the same step interval.
training_args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=1,
    # Per-device batch sizes for the training and evaluation loops.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate every 5000 steps ...
    eval_strategy="steps",
    eval_steps=5000,
    # ... and write a checkpoint on the same schedule.
    save_strategy="steps",
    save_steps=5000,
)

## 5. 开始训练

In [6]:
# Wire the model, the training configuration, the tokenized splits and the
# metric callback into a Trainer. The "test" split serves as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

In [7]:
# Evaluate before any training to record a baseline with the freshly initialized classifier head.
trainer.evaluate()

{'eval_loss': 1.7103939056396484,
 'eval_accuracy': 0.20018,
 'eval_runtime': 514.5202,
 'eval_samples_per_second': 97.178,
 'eval_steps_per_second': 3.038}

In [8]:
# Run the fine-tuning loop configured by training_args.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
5000,0.8007,0.76766,0.6637
10000,0.7443,0.73562,0.67894
15000,0.7197,0.708545,0.68836
20000,0.698,0.695257,0.6954


TrainOutput(global_step=20313, training_loss=0.7629087207084133, metrics={'train_runtime': 24136.6982, 'train_samples_per_second': 26.93, 'train_steps_per_second': 0.842, 'total_flos': 1.710267926016e+17, 'train_loss': 0.7629087207084133, 'epoch': 1.0})

In [11]:
# Persist the fine-tuned model to model_dir.
trainer.save_model(model_dir)
# The Trainer in this notebook was constructed without a tokenizer, so save_model()
# writes no tokenizer files. Save them next to the weights so that model_dir can be
# reloaded on its own for inference. The trailing semicolon hides the returned
# tuple of file paths from the cell output.
tokenizer.save_pretrained(model_dir);