In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, load_from_disk

In [7]:
# 加载数据集
dataset = load_dataset(path='glue', name='sst2')
# dataset.save_to_disk(dataset_dict_path="data/glue_sst2")
# dataset = load_from_disk('data/glue_sst2')

# from datasets import Dataset
# dir(Dataset)
dataset

Found cached dataset glue (/home/codespace/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [None]:
# 加载分词工具
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

# 分词
def f(data):
    return tokenizer(
        data['sentence'],
        padding='max_length',
        truncation=True,
        max_length=30,
    )

# map 方法
dataset = dataset.map(f, batched=True, batch_size=1000, num_proc=4)

# 取数据子集
dataset_train = dataset['train'].shuffle(seed=42).select(range(1000))
dataset_eval = dataset['validation'].shuffle(seed=42).select(range(200))
dataset_test = dataset['test'].shuffle(seed=42).select(range(200))

# del datasets
# dataset_train, dataset_eval, dataset_test

In [None]:
# 加载模型
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased',
                                                           num_labels=2)

print(sum([i.nelement() for i in model.parameters()]) / 10000)

In [None]:
import numpy as np
from datasets import load_metric
from transformers.trainer_utils import EvalPrediction

# 加载评价函数
metric = load_metric('accuracy')

# 定义评价函数
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    logits = logits.argmax(axis=1)
    return metric.compute(predictions=logits, references=labels)


# 模拟测试输出
eval_pred = EvalPrediction(
    predictions=np.array([[0, 1], [2, 3], [4, 5], [6, 7]]),
    label_ids=np.array([1, 1, 1, 1]),
)

compute_metrics(eval_pred)

In [21]:
from transformers import TrainingArguments, Trainer

# 初始化训练参数
args = TrainingArguments(output_dir='output_dir', evaluation_strategy='epoch')
args.num_train_epochs = 1
args.learning_rate = 1e-4
args.weight_decay = 1e-2
args.per_device_eval_batch_size = 32
args.per_device_train_batch_size = 16

# 初始化训练器
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
)

# 评价模型
# trainer.evaluate()  # 未训练之前模型的准确率
# # 训练
trainer.train()
# # 评价模型
# trainer.evaluate()
# # 保存模型
# trainer.save_model(output_dir='output_dir')

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence, idx. If sentence, idx are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1000
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 63
  Number of trainable parameters = 108311810


Epoch,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence, idx. If sentence, idx are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 32


IndexError: Target -1 is out of bounds.

In [None]:
# 批处理函数
def collate_fn(data):
    label = [i['label'] for i in data]
    input_ids = [i['input_ids'] for i in data]
    token_type_ids = [i['token_type_ids'] for i in data]
    attention_mask = [i['attention_mask'] for i in data]

    label = torch.LongTensor(label)
    input_ids = torch.LongTensor(input_ids)
    token_type_ids = torch.LongTensor(token_type_ids)
    attention_mask = torch.LongTensor(attention_mask)

    return label, input_ids, token_type_ids, attention_mask


# 数据加载器
loader_test = torch.utils.data.DataLoader(dataset=dataset_test,
                                          batch_size=4,
                                          collate_fn=collate_fn,
                                          shuffle=True,
                                          drop_last=True)

# for i, (label, input_ids, token_type_ids,
#         attention_mask) in enumerate(loader_test):
#     break

# label, input_ids, token_type_ids, attention_mask

In [None]:



#测试
def test():
    #加载参数
    model.load_state_dict(torch.load('./output_dir/pytorch_model.bin'))

    model.eval()

    #运算
    out = model(input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask)

    #[4, 2] -> [4]
    out = out['logits'].argmax(dim=1)

    correct = (out == label).sum().item()

    return correct / len(label)


test()