In [2]:
#| hide
# !pip install datasets evaluate transformers[sentencepiece]
from transformers import AutoTokenizer

In [61]:
#| export
import time

# 分词器(Tokenizer)

> Tokenizer用于把`str`转换成对应的`索引`,也可以把索引转换回来。

1.创建一个bert的分词器，

In [3]:
tokenizer=AutoTokenizer.from_pretrained('prajjwal1/bert-medium')

2.定义我们需要进行分词的tokens.

In [5]:
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
    "I love her"
]

3.使用`padding`保证返回的长度一致,并设置返回数据类型为torch.tensor

In [11]:
inputs=tokenizer(raw_inputs,padding=True,return_tensors='pt')
print(tokenizer.decode(inputs['input_ids'][0]))
print(tokenizer.decode(inputs['input_ids'][1]))
print(tokenizer.decode(inputs['input_ids'][2]))

[CLS] i've been waiting for a huggingface course my whole life. [SEP]
[CLS] i hate this so much! [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] i love her [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


4.有时需要使用`max_length`,`truncation`参数，指定分词串的最大长度。

In [10]:
inputs=tokenizer(raw_inputs,padding=True,max_length=8,truncation=True,return_tensors='pt')
print(tokenizer.decode(inputs['input_ids'][0]))
print(tokenizer.decode(inputs['input_ids'][1]))
print(tokenizer.decode(inputs['input_ids'][2]))

[CLS] i've been waiting for [SEP]
[CLS] i hate this so much! [SEP]
[CLS] i love her [SEP] [PAD] [PAD] [PAD]


5.打印分词结果，可以看到bert tokenizer还返回`token_type_ids`,`attention_mask`

In [18]:
for x in inputs.items():
    print(f"{x[0]}:{x[1].shape}")

input_ids:torch.Size([3, 16])
token_type_ids:torch.Size([3, 16])
attention_mask:torch.Size([3, 16])


## tokenizer连接句子

> 把2个tokens数组 分别作为参数传递给`tokenizer`，即可对这两句子连接并转换成索引,句子之间会自动加入`[SEP]`。

In [20]:
raw_inputs1 = [
    "I love her"
]
raw_inputs2 = [
    "I hate this so much!",
]
inputs=tokenizer(raw_inputs1,raw_inputs2,padding=True,return_tensors='pt')
print(tokenizer.decode(inputs['input_ids'][0]))

[CLS] i love her [SEP] i hate this so much! [SEP]


## tokenizer的重要属性

In [11]:
print(len(tokenizer.get_vocab()))
print(tokenizer.pad_token_id,tokenizer.pad_token)
print(tokenizer.sep_token_id,tokenizer.sep_token)
print(tokenizer.eos_token_id,tokenizer.eos_token)

Using eos_token, but it is not set yet.


30522
0 [PAD]
102 [SEP]
None None


# 模型

## 基础模型：AutoModel

In [22]:
#| hide
from transformers import AutoModel

In [None]:
# model=AutoModel.from_pretrained('prajjwal1/bert-medium')
model=AutoModel.from_pretrained('prajjwal1/bert-medium',ignore_mismatched_sizes=True)

使用``model(**inputs)``对输入进行编码

In [29]:
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
    "I love her"
]
inputs=tokenizer(raw_inputs,padding=True,return_tensors='pt')
output=model(**inputs)
output.last_hidden_state.shape

torch.Size([3, 16, 512])

和pytorch的model一样，可以使用`get_submodule`查看模型的结构

In [30]:
model.get_submodule("pooler")

BertPooler(
  (dense): Linear(in_features=512, out_features=512, bias=True)
  (activation): Tanh()
)

##  分类模型：AutoModelForSequenceClassification

In [31]:
#| hide
from transformers import AutoModelForSequenceClassification

In [34]:
model=AutoModelForSequenceClassification.from_pretrained(
    'prajjwal1/bert-medium',                                                     
     num_labels=33,  #覆盖classifer header
     ignore_mismatched_sizes=True)

Some weights of the model checkpoint at prajjwal1/bert-medium were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not init

In [33]:
model.get_submodule('classifier')

Linear(in_features=512, out_features=33, bias=True)

## 生成模型:AutoModelForCausalLM

In [35]:
#| hide
from transformers import AutoModelForCausalLM

In [36]:
model=AutoModelForCausalLM.from_pretrained('prajjwal1/bert-medium',ignore_mismatched_sizes=True)
model.get_submodule('cls')

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of the model checkpoint at prajjwal1/bert-medium were not used when initializing BertLMHeadModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertOnlyMLMHead(
  (predictions): BertLMPredictionHead(
    (transform): BertPredictionHeadTransform(
      (dense): Linear(in_features=512, out_features=512, bias=True)
      (transform_act_fn): GELUActivation()
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
    )
    (decoder): Linear(in_features=512, out_features=30522, bias=True)
  )
)

## 模型的保存与加载

In [None]:
model.save_pretrained('zxk/my-bert')
model=AutoModel.from_pretrained('zxk/my-bert')

# Model_Config(MetaData)

In [49]:
from transformers import BertConfig, BertModel,AutoConfig

In [57]:
#原型，所以也是最不重要的，因为我们机会不会从头训练
# config=BertConfig(vocab_size=200,initializer_range=0.3)
config=BertConfig.from_pretrained('prajjwal1/bert-medium',vocab_size=200,initializer_range=0.3)
print(config)
model=BertModel(config)
model.get_submodule('embeddings')

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.3,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 8,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 200
}



BertEmbeddings(
  (word_embeddings): Embedding(200, 512, padding_idx=0)
  (position_embeddings): Embedding(512, 512)
  (token_type_embeddings): Embedding(2, 512)
  (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

# 数据集

## 数据集的下载，浏览

In [51]:
from datasets import load_dataset

In [41]:
# glue是由10多种数据集(cola,sst2,mrpc,qqp...)构成的benckmark
# mrpc是数据集，比较2句话是否是一个意思
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

Found cached dataset glue (/Users/zhanggxk/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

查看数据集，我们查看训练数据集，索引为1的数据

In [42]:
raw_datasets['train'][1:2]
# 理解数据
# s1:在1998年，Yucaipa公司拥有Dominick's超市连锁店，
# 随后以25亿美元的价格将该连锁店出售给Safeway公司。

# s2:1995年Yucaipa以6.93亿美元的价格购买了Dominick's超市连锁店，
# 然后在1998年以18亿美元的价格将其出售给了Safeway公司。
# label:0，两句话无关
# idx:1

{'sentence1': ["Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion ."],
 'sentence2': ["Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 ."],
 'label': [0],
 'idx': [1]}

查看数据的特征

In [43]:
raw_datasets['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

## 向量化(tensor)

> `dataset.map`方法结合tokenizer，可以把数据集合转换成tensor，输入到model。

In [44]:
def tokenizr_func(example):
    return tokenizer(example['sentence1'],example['sentence2'],
                   padding=True,
                   return_tensors='pt',
                   truncation=True,
                   max_length=128)

ds=raw_datasets.map(tokenizr_func,batched=True)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

删除无用的列，重命名labels的原因是 ,transformer认为labels的特征是目标.

In [45]:
ds=ds.remove_columns(['sentence1','sentence2','idx']).rename_column('label','labels')
ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

## 分割数据集

In [103]:
raw_datasets['train'].train_test_split(0.5)

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1834
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1834
    })
})

## 从pandas获得dataset

In [105]:
from datasets import Dataset,DatasetDict
import pandas as pd

In [110]:
df=pd.read_csv('model_benchmark.csv')
ds=Dataset.from_pandas(df)
ds.features

{'model_name': Value(dtype='string', id=None),
 'learning_rate': Value(dtype='float64', id=None),
 'pool': Value(dtype='string', id=None),
 'dataset': Value(dtype='string', id=None),
 'GPU_mem': Value(dtype='float64', id=None),
 'error_rate': Value(dtype='float64', id=None),
 'valid_loss': Value(dtype='float64', id=None),
 'train_loss': Value(dtype='float64', id=None),
 'fit_time': Value(dtype='float64', id=None)}

## 动态与静态padding

In [49]:
from torch.utils.data import dataset, sampler, dataloader
from datasets import load_dataset,DatasetDict

In [54]:
#| export
def batch_dataset(ds:DatasetDict, #数据集
                  mapping_func #func(example,maxlen)签名的函数
                 ):
    """
    使用`mapping_func`作用于数据集ds上，把转换后的结果返回
    """
    ds=ds.map(padd_func,batched=True)
    ds=ds.remove_columns(['sentence1','sentence2','idx']).rename_column('label','labels')
      # dataset.map返回的数据集，sample是存放在list中的，加上这句话，把list转换成对应的tensor
    ds=ds.with_format('torch')
    return ds

Found cached dataset glue (/Users/zhanggxk/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
#| hide
raw_datasets = load_dataset("glue", "mrpc")

### 静态padding

In [None]:
#| export
def padding_fix(example,maxlen=512):
    """
    把example的句子通过`padding`填充的方式，转换成固定`maxlen`长度的tokens，然后返回。
    """
    ret= tokenizer(example['sentence1'],
                   example['sentence2'],
                   padding=True,
                   truncation=True,
                   max_length=maxlen)
    return ret

**用padding_fix处理好的数据集，每个batch,字符序列的长度都固定**

In [58]:
ds_train=batch_dataset(raw_datasets['train'],padding_fix)
dls=dataloader.DataLoader(ds_train,batch_size=50,shuffle=False)

for k,t in enumerate(dls):
    print(t['input_ids'].shape)
    if k==2:break

### 动态padding

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

torch.Size([50, 89])
torch.Size([50, 89])
torch.Size([50, 89])


### 动态padding

In [66]:
#| hide
from transformers import DataCollatorWithPadding

In [64]:
#| export
def padding_dynamic(example,maxlen=512):
    """
    按照example的句子按照句子的实际长度进行返回。
    """
    ret= tokenizer(example['sentence1'],example['sentence2'])
    return ret

**通过传递给`DataLoader` 对象收集器 `DataCollatorWithPadding`，每一个batch的长度都是动态被填充为batch中最长序列的。**

In [65]:
ds_train=batch_dataset(raw_datasets['train'],padding_dynamic)

#数据收集器
data_collector=DataCollatorWithPadding(tokenizer=tokenizer)
dls=dataloader.DataLoader(ds_train,batch_size=5,shuffle=False,collate_fn=data_collector)

for k,t in enumerate(dls):
    print(t['input_ids'].shape)
    if k==2:break

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

torch.Size([5, 67])
torch.Size([5, 62])
torch.Size([5, 61])


### DataCollatorWithPadding 
> data_collator接受一个map，把map中的list元素转换成tensor。如果list中的tensor长度不一，那么就会padding补齐。

In [68]:
# data_collator接受一个map，都没个把每个value(list)都转换成对于的tensor,如果value的元素长度参差不齐时，使用padding补齐后，在转成tensor
inp={
    'input_ids':[[1],[2,2],[3,3,3]]
}
data_collector=DataCollatorWithPadding(tokenizer=tokenizer)
batch=data_collector(inp)
batch['input_ids'],batch['attention_mask']

(tensor([[1, 0, 0],
         [2, 2, 0],
         [3, 3, 3]]),
 tensor([[1, 0, 0],
         [1, 1, 0],
         [1, 1, 1]]))

## 更多的数据集

### xsum: 总结概述训练集

In [None]:
d = load_dataset('xsum', split='train')

In [115]:
d[1:2]

{'document': ['A fire alarm went off at the Holiday Inn in Hope Street at about 04:20 BST on Saturday and guests were asked to leave the hotel.\nAs they gathered outside they saw the two buses, parked side-by-side in the car park, engulfed by flames.\nOne of the tour groups is from Germany, the other from China and Taiwan. It was their first night in Northern Ireland.\nThe driver of one of the buses said many of the passengers had left personal belongings on board and these had been destroyed.\nBoth groups have organised replacement coaches and will begin their tour of the north coast later than they had planned.\nPolice have appealed for information about the attack.\nInsp David Gibson said: "It appears as though the fire started under one of the buses before spreading to the second.\n"While the exact cause is still under investigation, it is thought that the fire was started deliberately."'],
 'summary': ['Two tourist buses have been destroyed by fire in a suspected arson attack in B

### AWS 评论数据集

In [None]:
d = load_dataset('amazon_us_reviews', 'Video_v1_00')

# 训练

## Transformer的训练流程

In [73]:
from transformers import Trainer,TrainingArguments,DataCollatorWithPadding
import numpy as np

1.使用动态padding的方法创建处理数据集

In [None]:
data=batch_dataset(raw_datasets,padding_dynamic)

2.定义好模型

In [None]:
model=AutoModelForSequenceClassification.from_pretrained('prajjwal1/bert-medium')

3.metric指标，函数签名要求返回一个`dict`

In [74]:
def compute_metrics(data):
    logits,labels=data
    
    pred_labels=np.argmax(logits,-1)
    acc=(pred_labels==labels).mean()
    
    return {"acc":acc}

def compute_metrics_mrpc(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    #这是evaluate返回函数的固定方法compute,根据预测与标注，返回dict（包含所有指标）
    return metric.compute(predictions=predictions, references=labels)

4.设置TrainingArguments,第一个参数表示要保存模型的路径

In [75]:
args=TrainingArguments('output',evaluation_strategy='epoch',)
trainer=Trainer(model,args,
                train_dataset=data['train'],
                eval_dataset=data['validation'],
                data_collator=DataCollatorWithPadding(tokenizer),
                tokenizer=tokenizer,
                compute_metrics=compute_metrics
               )

In [172]:
trainer.train()



Epoch,Training Loss,Validation Loss,Acc
1,No log,0.399317,0.82598
2,0.509500,0.513949,0.833333
3,0.290800,0.70898,0.85049


TrainOutput(global_step=1377, training_loss=0.3337492572283, metrics={'train_runtime': 78.7102, 'train_samples_per_second': 139.804, 'train_steps_per_second': 17.495, 'total_flos': 120856103619216.0, 'train_loss': 0.3337492572283, 'epoch': 3.0})

5.预测测试数据集

In [78]:
### test dataset
test_predict=trainer.predict(data['test'])
print(f'predict shape:{test_predict.predictions.shape}')
print(f'label shape:{test_predict.label_ids.shape}')
test_predict.metrics

***** Running Prediction *****
  Num examples = 1725
  Batch size = 8


predict shape:(1725, 2)
label shape:(1725,)


{'test_loss': 0.6360893845558167,
 'test_acc': 0.6684057971014493,
 'test_runtime': 20.242,
 'test_samples_per_second': 85.219,
 'test_steps_per_second': 10.671}

# 常见问题

## transformer的模型，tokenizer，config都下载的本地的哪里呢？

In [83]:
! ls ~/.cache/huggingface/hub

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[1m[36mmodels--bert-base-cased[m[m        tmpimfqc44k
[1m[36mmodels--prajjwal1--bert-medium[m[m version.txt


## load_dataset的参数含义？

In [None]:
# split表示加载训练 还是 测试数据集
load_dataset(dset_name,subset_name, split='train')

#补充
*   补充babi数据集
*   transformer与 pytorch模型训练

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()