# 加载数据

In [1]:
from datasets import load_dataset, load_metric

In [2]:
task = 'cola'
model_checkpoint = 'distilbert-base-uncased'
batch_size = 512

In [3]:
actual_task = 'mnli' if task == 'mnli-mm' else task

In [4]:
dataset = load_dataset('glue', actual_task)

Reusing dataset glue (C:\Users\lyk\.cache\huggingface\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [5]:
metric = load_metric('glue', actual_task)

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [7]:
dataset['train'][0]

{'sentence': "Our friends won't buy this analysis, let alone the next one we propose.",
 'label': 1,
 'idx': 0}

In [8]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

In [9]:
def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "can't pick more elements than there are in the dataset"
    # 不重复的num_examples
    picks = []
    for _ in  range(num_examples):
        pick = random.randint(0, len(dataset) - 1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [10]:
show_random_elements(dataset['train'])

Unnamed: 0,sentence,label,idx
0,John deposited some money in the checking account and Mary did the same thing.,acceptable,3902
1,Steve pelted Anna acorns.,unacceptable,2791
2,I lent the book halfway to Tony.,unacceptable,2101
3,The pillow remained stuffed with feathers.,acceptable,2442
4,Her sister hurried.,acceptable,3407
5,John often kisses Mary.,acceptable,353
6,Martha carved the piece of wood from a branch into a toy.,unacceptable,2297
7,$50 won't even purchase a dress at Bloomingdale's.,acceptable,2753
8,Wanda taught the students French.,acceptable,3041
9,I played a tune on my iPod.,acceptable,5915


In [11]:
metric

Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = datasets.load_metric('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=references)
    >>> print(res

In [12]:
import numpy as np

In [13]:
glue_metric = datasets.load_metric('glue', 'mrpc')

In [14]:
fake_preds = np.random.randint(0, 2, size=(64,))
fake_labels = np.random.randint(0, 2, size=(64, ))
glue_metric.compute(predictions=fake_preds, references=fake_labels)

{'accuracy': 0.421875, 'f1': 0.43076923076923074}

# 数据预处理

In [15]:
from transformers import AutoTokenizer

In [16]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [17]:
tokenizer("Hello, this one sentence!", "And this sentence goes with it.")

{'input_ids': [101, 7592, 1010, 2023, 2028, 6251, 999, 102, 1998, 2023, 6251, 3632, 2007, 2009, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [18]:
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mnli-mm": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

In [19]:
sentence_1_key, sentence_2_key = task_to_keys[task]
if sentence_2_key is None:
    print(f"Sentence:{dataset['train'][0][sentence_1_key]}")
else:
    print(f"Sentence 1:{dataset['train'][0][sentence_1_key]}")
    print(f"Sentence 2:{dataset['train'][0][sentence_2_key]}")

Sentence:Our friends won't buy this analysis, let alone the next one we propose.


In [20]:
def preprocess_function(examples):
    if sentence_2_key is None:
        return tokenizer(examples[sentence_1_key], truncation=True)
    return tokenizer(examples[sentence_1_key], examples[sentence_2_key], truncation=True)

In [21]:
preprocess_function(dataset['train'][:5])

{'input_ids': [[101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102], [101, 2028, 2062, 18404, 2236, 3989, 1998, 1045, 1005, 1049, 3228, 2039, 1012, 102], [101, 2028, 2062, 18404, 2236, 3989, 2030, 1045, 1005, 1049, 3228, 2039, 1012, 102], [101, 1996, 2062, 2057, 2817, 16025, 1010, 1996, 13675, 16103, 2121, 2027, 2131, 1012, 102], [101, 2154, 2011, 2154, 1996, 8866, 2024, 2893, 14163, 8024, 3771, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [22]:
encoded_dataset = dataset.map(preprocess_function, batched=True)

Loading cached processed dataset at C:\Users\lyk\.cache\huggingface\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-971b27846e2d7b59.arrow
Loading cached processed dataset at C:\Users\lyk\.cache\huggingface\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-ee0b403699890a59.arrow
Loading cached processed dataset at C:\Users\lyk\.cache\huggingface\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad\cache-b5676bd959c9335a.arrow


# 微调预训练模型

In [23]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [24]:
num_labels = 3 if task.startswith("mnli") else 1 if task=='stsb' else 2

In [25]:
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifi

In [26]:
metric_name = 'pearson' if task=='stsb' else 'matthews_correlation' if task=='cola' else 'accuracy'

In [27]:
args = TrainingArguments(
    'test-glue',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    report_to="none"
)

In [28]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if task != 'stsb':
        predictions = np.argmax(predictions, axis=1)
    else:
        predictions = predictions[:, 0]
    return metric.compute(predictions=predictions, references=labels)

In [29]:
validation_key = "validation_mismatched" if task == 'mnli-mm' else 'validation_matched' if task == 'mnli' else 'validation'

In [30]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [31]:
trainer.train()

Epoch,Training Loss,Validation Loss,Matthews Correlation
1,No log,0.608817,0.0
2,No log,0.598182,0.0


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


TrainOutput(global_step=34, training_loss=0.6087829926434685, metrics={'train_runtime': 817.5292, 'train_samples_per_second': 0.042, 'total_flos': 0, 'epoch': 2.0, 'init_mem_cpu_alloc_delta': 1095983104, 'init_mem_gpu_alloc_delta': 268953088, 'init_mem_cpu_peaked_delta': 287686656, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 1092308992, 'train_mem_gpu_alloc_delta': 807649280, 'train_mem_cpu_peaked_delta': 24576, 'train_mem_gpu_peaked_delta': 8724788736})

In [32]:
trainer.evaluate()

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


{'eval_loss': 0.608816921710968,
 'eval_matthews_correlation': 0.0,
 'eval_runtime': 23.7043,
 'eval_samples_per_second': 44.0,
 'epoch': 2.0,
 'eval_mem_cpu_alloc_delta': 81920,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 45056,
 'eval_mem_gpu_peaked_delta': 605854208}

# 超参数搜索

In [33]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

In [34]:
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifi

In [None]:
best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")

[32m[I 2021-08-25 22:56:25,157][0m A new study created in memory with name: no-name-cc700918-e411-4d33-a7a0-8c37eb83f325[0m
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the 

Epoch,Training Loss,Validation Loss


In [None]:
best_run