In [21]:
import torch
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import  DataCollatorWithPadding
from torch.utils.data import DataLoader
from transformers import AdamW,get_scheduler


# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [22]:
# Load the dataset: the MRPC configuration of GLUE (a sentence-pair task)
raw_datasets = load_dataset(path='glue', name='mrpc')
raw_datasets  # displays the resulting DatasetDict

Found cached dataset glue (C:/Users/lizhong/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [29]:
# Load the tokenizer that belongs to the pretrained checkpoint
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
)

# Encoding step used by `Dataset.map`
def tokenize_function(example):
    """Tokenize the `sentence1` / `sentence2` fields of `example` as a pair.

    Both fields are handed to the notebook-level `tokenizer` with truncation
    enabled. No `return_tensors="pt"` is requested here; conversion to
    tensors is left to the data collator.

    Returns whatever `tokenizer` returns for the pair.
    """
    first_sentences = example['sentence1']
    second_sentences = example['sentence2']
    return tokenizer(first_sentences, second_sentences, truncation=True)

# Batch collate function: pads each batch and converts it to tensors
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize every split with a batched map (the result keeps the dataset format),
# then drop the columns the model does not need
tokenized_datasets = (
    raw_datasets
    .map(tokenize_function, batched=True)
    .remove_columns(['sentence1', 'sentence2', 'idx'])
)

# NOTE(review): nothing in this cell renames `label` to `labels` or converts the
# lists to tensors; downstream code reads batch["labels"], so presumably
# `data_collator` takes care of both - confirm against the collator docs.
tokenized_datasets['train'].column_names  # ['label', 'input_ids', 'token_type_ids', 'attention_mask']

# Display one tokenized training example
tokenized_datasets['train'][0]

Loading cached processed dataset at C:\Users\lizhong\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-0934c045dd1afe10.arrow
Loading cached processed dataset at C:\Users\lizhong\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-0ca0a34349921406.arrow
Loading cached processed dataset at C:\Users\lizhong\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-c3b222f285c62abb.arrow


{'label': 1,
 'input_ids': [101,
  2572,
  3217,
  5831,
  5496,
  2010,
  2567,
  1010,
  3183,
  2002,
  2170,
  1000,
  1996,
  7409,
  1000,
  1010,
  1997,
  9969,
  4487,
  23809,
  3436,
  2010,
  3350,
  1012,
  102,
  7727,
  2000,
  2032,
  2004,
  2069,
  1000,
  1996,
  7409,
  1000,
  1010,
  2572,
  3217,
  5831,
  5496,
  2010,
  2567,
  1997,
  9969,
  4487,
  23809,
  3436,
  2010,
  3350,
  1012,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [26]:
# Debugging aid: list the names currently defined in the kernel namespace
%who

AdamW	 AutoModelForSequenceClassification	 AutoTokenizer	 DataCollatorWithPadding	 DataLoader	 batch	 checkpoint	 data_collator	 device	 
epoch	 eval_dataloader	 get_scheduler	 load_dataset	 load_metric	 loss	 lr_scheduler	 metric	 model	 
num_epochs	 num_training_steps	 optimizer	 outputs	 pbar	 progress_bar	 raw_datasets	 test	 tokenize_function	 
tokenized_datasets	 tokenizer	 torch	 tqdm	 train	 train_dataloader	 


In [30]:
# Build the DataLoaders: both use batches of 8 and the padding collator;
# only the training split is shuffled
train_dataloader, eval_dataloader = (
    DataLoader(tokenized_datasets[split_name],
               shuffle=should_shuffle,
               batch_size=8,
               collate_fn=data_collator)
    for split_name, should_shuffle in (('train', True), ('validation', False))
)

# Pull a single batch out of the training loader for inspection
for batch in train_dataloader:
    break

batch  # transformers.tokenization_utils_base.BatchEncoding

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[  101,  7513,  2003,  8225,  2000, 11477,  2049,  4274, 10566, 16602,
          1010,  2206,  1037,  7353, 14392,  2008,  2253,  2114,  1996,  2194,
          1010,  7513,  2056,  5958,  1012,   102,  7513, 13058,  1012,  2003,
          8225,  3431,  2000,  2049,  4274, 10566,  1006, 29464,  1007, 16602,
          2138,  1997,  1037,  7353, 14392,  2114,  2009,  1010,  1996,  2194,
          2056,  5958,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [  101,  1996,  2446,  5446,  3757,  2006,  6928,  2056,  2045,  2001,
          2053,  2553,  2005,  3730,  7406,  1012,   102,  6983,  5446,  2704,
         14405,  3775, 10556,  6894,  9626,  3211,  2056,  1024,  1000,  2045,
          2003,  2053,  2282,  2005,  1037,  3730,  7406,  1012,  1000,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0, 

In [31]:
# Load the pretrained checkpoint as a sequence classifier with two output labels
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [32]:
# Sanity check: a single forward pass on the sample batch.
# The result is a transformers.modeling_outputs.SequenceClassifierOutput;
# dir(outputs), outputs.keys() and outputs.items() are handy for exploring it.
outputs = model(**batch)

# Show the loss together with the shape of the logits
outputs.loss, outputs.logits.shape

['__annotations__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__dataclass_fields__',
 '__dataclass_params__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__ior__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__or__',
 '__post_init__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__ror__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'attentions',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'hidden_states',
 'items',
 'keys',
 'logits',
 'loss',
 'move_to_end',
 'pop',
 'popitem',
 'setdefault',
 'to_tuple',
 'update',
 'values']

In [8]:
# Training length: one scheduler step per batch, for every epoch
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs

# Set up the optimizer. It is built from the parameters of the current `model`
# object, so this cell must be re-run whenever the model cell is re-run.
# NOTE(review): the AdamW exported by `transformers` is reportedly deprecated in
# favour of `torch.optim.AdamW` - confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Linear learning-rate schedule over all training steps, with no warmup steps
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)



In [9]:
# Move the model to the selected device; as the last expression of the cell,
# the returned module is also displayed
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [15]:
from tqdm import tqdm

def train():
    """Fine-tune `model` for `num_epochs` passes over `train_dataloader`.

    Operates on notebook-level globals only: `model`, `optimizer`,
    `lr_scheduler`, `train_dataloader`, `device`, `num_epochs`,
    `num_training_steps` and `tqdm`. Every batch is moved to `device`, then a
    forward pass, a backward pass, an optimizer step, a scheduler step and a
    gradient reset are performed, in that order.

    The progress bar has `num_training_steps` ticks (epochs x batches) and is
    advanced once per optimizer step, so it finishes exactly at its total.

    Returns:
        None. The model, optimizer and scheduler are updated in place.
    """
    model.train()

    # The context manager closes the bar when the loop ends (or raises), so a
    # finished or aborted run never leaves an open bar behind.
    with tqdm(total=num_training_steps) as pbar:
        for _ in range(num_epochs):
            for batch in train_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}

                # Forward pass; the loss is computed because labels are passed in
                outputs = model(**batch)
                loss = outputs.loss

                # Backward pass: compute the gradients
                loss.backward()

                # Update the parameters and the learning rate
                optimizer.step()
                lr_scheduler.step()

                # Reset the gradients for the next step
                optimizer.zero_grad()

                # One tick per step, so no trailing partial chunk is left unreported
                pbar.update(1)

# Run the training loop when this cell (or the exported script) is executed
# directly, printing a marker before and after
if __name__ == "__main__":
    print("start...")
    train()
    print("end...")

start...




  0%|                                                                                         | 0/1377 [00:00<?, ?it/s][A[A

  4%|██▉                                                                             | 50/1377 [00:07<03:06,  7.11it/s][A[A

  7%|█████▋                                                                         | 100/1377 [00:13<02:54,  7.32it/s][A[A

 11%|████████▌                                                                      | 150/1377 [00:20<02:47,  7.35it/s][A[A

 15%|███████████▍                                                                   | 200/1377 [00:27<02:38,  7.40it/s][A[A

 18%|██████████████▎                                                                | 250/1377 [00:33<02:30,  7.48it/s][A[A

 22%|█████████████████▏                                                             | 300/1377 [00:40<02:24,  7.43it/s][A[A

 25%|████████████████████                                                           | 350/1377 [00:47<02:17, 

end...





In [17]:
from datasets import load_metric
# Metric object for GLUE / MRPC; it accumulates batches and is consumed by test()
metric = load_metric('glue', 'mrpc')

def test():
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k,v in batch.items()}
    #     temp = {k: v.shape for k,v in batch.items()}
    #     print(temp)
        with torch.no_grad():
            outputs = model(**batch)
    #     print(outputs)
        logits = outputs.logits
    #     print(logits.shape)
        predictions = torch.argmax(logits, dim=-1)
    #     print(predictions.shape, batch['labels'].shape)
        metric.add_batch(predictions=predictions, 
                         references=batch["labels"])
    print(metric.compute())
    
# Run the evaluation when this cell (or the exported script) is executed
# directly, printing a marker before and after
if __name__ == "__main__":
    print("start...")
    test()
    print("end...")

start...
{'accuracy': 0.8602941176470589, 'f1': 0.9028960817717206}
end...
