第一步：安装和导入必要包

In [None]:
!pip install datasets transformers seqeval

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading d

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import load_dataset, load_metric, ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML
import numpy as np

第二步：原始数据加载

In [None]:
# Experiment configuration shared by the cells below.
# `task` selects which "<task>_tags" column of the dataset is used as labels.
task = "ner" # Should be one of "ner", "pos" or "chunk"
# Name of the pretrained checkpoint passed to the tokenizer and model loaders.
model_checkpoint = "distilbert-base-uncased"
# Batch size hyper-parameter.
batch_size = 16

In [None]:
# Load the CoNLL-2003 dataset (train / validation / test splits)
datasets = load_dataset("conll2003")

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [None]:
# Display the dataset dictionary: split names, columns and row counts
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [None]:
# Inspect the first training example
datasets["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [None]:
# Label names for the configured task; a tag id is an index into this list.
# Built from `task` so it stays in sync with the column used for training.
label_list = datasets["train"].features[f"{task}_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [None]:
# Visual preview of the raw data
def show_random_elements(dataset, num_examples=10):
    """Display ``num_examples`` distinct, randomly chosen rows of ``dataset`` as an HTML table.

    Columns whose feature is a ``ClassLabel`` (or a ``Sequence`` of ``ClassLabel``)
    are shown with their label names instead of integer ids.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            indices, and a ``features`` mapping of column name -> feature type.
        num_examples: Number of rows to show; must not exceed ``len(dataset)``.

    Raises:
        AssertionError: If ``num_examples`` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # Sampling without replacement yields distinct indices directly, replacing
    # the draw-and-retry loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Single class id per row -> its name.
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            # List of class ids per row -> list of names.
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

# Preview random training rows (uses the default num_examples=10)
show_random_elements(datasets["train"])

Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags
0,2265,"[Leading, second, round]","[JJ, JJ, NN]","[B-NP, I-NP, I-NP]","[O, O, O]"
1,6134,"[Two, other, girls, have, been, rescued, and, police, are, hunting, for, at, least, two, more, who, Dutroux, has, admitted, kidnapping, a, year, ago, .]","[CD, JJ, NNS, VBP, VBN, VBN, CC, NNS, VBP, VBG, IN, IN, JJS, CD, JJR, WP, NNP, VBZ, VBN, NN, DT, NN, RB, .]","[B-NP, I-NP, I-NP, B-VP, I-VP, I-VP, O, B-NP, B-VP, I-VP, B-PP, B-NP, I-NP, I-NP, I-NP, B-NP, I-NP, B-VP, I-VP, B-NP, B-NP, I-NP, B-ADVP, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-PER, O, O, O, O, O, O, O]"
2,3485,"[+1, Fred, Funk, through, 9]","[NNP, NNP, NNP, IN, CD]","[B-NP, I-NP, I-NP, B-PP, B-NP]","[O, B-PER, I-PER, O, O]"
3,7285,"[Kuperman, added, that, options, other, than, postponement, were, also, on, the, table, ,, but, she, refused, to, specify, what, they, were, .]","[NNP, VBD, IN, NNS, JJ, IN, NN, VBD, RB, IN, DT, NN, ,, CC, PRP, VBD, TO, VB, WP, PRP, VBD, .]","[B-NP, B-VP, B-SBAR, B-NP, B-ADJP, B-PP, B-NP, B-VP, B-ADVP, B-PP, B-NP, I-NP, O, O, B-NP, B-VP, I-VP, I-VP, B-NP, B-NP, B-VP, O]","[B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,12180,"[Average, prices, and, representative, prices, for, table, wines, at, the, various, marketing, centres, (, 96, /, C, 251/02, )]","[JJ, NNS, CC, NN, NNS, IN, NN, NNS, IN, DT, JJ, NN, VBZ, (, CD, SYM, NNP, NNP, )]","[B-NP, I-NP, O, B-NP, I-NP, B-PP, B-NP, I-NP, B-PP, B-NP, I-NP, I-NP, B-VP, O, B-NP, O, B-NP, I-NP, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
5,4882,"[LONDON, 1996-08-25]","[NNP, CD]","[B-NP, I-NP]","[B-LOC, O]"
6,13400,"[India, has, acquired, 120,000, tonnes, of, diesel, in, three, cargoes, ,, bound, for, the, west, coast, ,, in, its, October, tender, .]","[NNP, VBZ, VBN, CD, NNS, IN, NN, IN, CD, NNS, ,, JJ, IN, DT, JJ, NN, ,, IN, PRP$, NNP, NN, .]","[B-NP, B-VP, I-VP, B-NP, I-NP, B-PP, B-NP, B-PP, B-NP, I-NP, O, B-ADJP, B-PP, B-NP, I-NP, I-NP, O, B-PP, B-NP, I-NP, I-NP, O]","[B-LOC, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
7,10221,"[Although, five, days, of, the, ban, were, suspended, until, January, 1, ,, Weaver, will, miss, next, month, 's, big, St, Leger, meeting, ,, including, the, ride, on, top, stayer, Double, Trigger, in, the, Doncaster, Cup, .]","[IN, CD, NNS, IN, DT, NN, VBD, VBN, IN, NNP, CD, ,, NNP, MD, VB, JJ, NN, POS, JJ, NNP, NNP, NN, ,, VBG, DT, NN, IN, JJ, NN, RB, NNP, IN, DT, NNP, NNP, .]","[B-SBAR, B-NP, I-NP, B-PP, B-NP, I-NP, B-VP, I-VP, B-PP, B-NP, I-NP, O, B-NP, B-VP, I-VP, B-NP, I-NP, B-NP, I-NP, I-NP, I-NP, I-NP, O, B-PP, B-NP, I-NP, B-PP, B-NP, I-NP, I-NP, B-ADJP, B-PP, B-NP, I-NP, I-NP, O]","[O, O, O, O, O, O, O, O, O, O, O, O, B-PER, O, O, O, O, O, O, B-ORG, I-ORG, O, O, O, O, O, O, O, O, B-PER, I-PER, O, O, B-MISC, I-MISC, O]"
8,10993,"[1260, -, The, Ghibellines, retook, the, city, of, Florence, from, the, Florentine, Guelfs, at, the, battle, of, Monte, Aperto, .]","[CD, :, DT, NNPS, VBD, DT, NN, IN, NNP, IN, DT, JJ, NNP, IN, DT, NN, IN, NNP, NNP, .]","[B-NP, I-NP, I-NP, I-NP, B-VP, B-NP, I-NP, B-PP, B-NP, B-PP, B-NP, I-NP, I-NP, B-PP, B-NP, I-NP, B-PP, B-NP, I-NP, O]","[O, O, O, B-MISC, O, O, O, O, B-LOC, O, O, B-LOC, I-LOC, O, O, O, O, B-LOC, I-LOC, O]"
9,4316,"[-, Iraq, denounces, violation, of, airspace, by, U.S., warplanes, .]","[:, NNP, VBZ, NN, IN, NN, IN, NNP, NNS, .]","[O, B-NP, B-VP, B-NP, B-PP, B-NP, B-PP, B-NP, I-NP, O]","[O, B-LOC, O, O, O, O, O, B-LOC, O, O]"


In [None]:
# Label names for the configured task's tag column; a tag id indexes this list
label_list = datasets["train"].features[f"{task}_tags"].feature.names

第三步：数据处理

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Tokenization + label alignment. The tokenizer splits words into sub-word
# pieces, so the token sequence is longer than the original word sequence and the
# word-level tags have to be re-projected onto the new tokens. Special tokens
# (start / end markers) get -100.
# When True, every sub-token of a word is labelled; when False only the first
# sub-token is labelled and the rest get -100.
label_all_tokens = True


def _continuation_label_ids(names):
    """Map each label id to the id to use on non-first sub-tokens of a word.

    A ``B-X`` label maps to the id of ``I-X`` when that label exists; every
    other label (``O``, ``I-X``, or tags without a ``B-`` prefix) maps to itself.
    """
    name_to_id = {name: idx for idx, name in enumerate(names)}
    mapping = {}
    for idx, name in enumerate(names):
        if name.startswith("B-"):
            mapping[idx] = name_to_id.get("I-" + name[2:], idx)
        else:
            mapping[idx] = idx
    return mapping


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align the tags with the sub-tokens.

    Args:
        examples: Batch mapping with a ``"tokens"`` entry (list of word lists)
            and a ``f"{task}_tags"`` entry (list of per-word label-id lists).

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of label
        ids per sentence, aligned with the sub-tokens. ``-100`` marks positions
        without a label.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    continuation_ids = _continuation_label_ids(label_list)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # Remaining pieces of the same word: a "B-X" tag becomes "I-X" so
            # one word never yields several consecutive "B-X" tags, which an
            # entity-level scorer would count as several entities.
            elif label_all_tokens:
                word_label = label[word_idx]
                label_ids.append(continuation_ids.get(word_label, word_label))
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Quick check of the alignment on a one-example batch
tokenize_and_align_labels(datasets['train'][0:1])

{'input_ids': [[101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]]}

In [None]:
# Apply tokenization + label alignment to the dataset, batch by batch
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

第四步：构建模型

In [None]:
# Load the pretrained checkpoint with a token-classification head that has one
# output per entry of label_list
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

# Training hyper-parameters
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    eval_strategy = "epoch",
    learning_rate=2e-5,
    # Read the batch size from the configuration cell instead of a hard-coded
    # literal so there is a single place to tune it.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    # push_to_hub=True,
)

# Batch collator for token-classification examples
data_collator = DataCollatorForTokenClassification(tokenizer)

# Evaluation metric. Passing trust_remote_code=True avoids the interactive
# "Do you wish to run the custom code? [y/N]" prompt, which would otherwise
# stall an unattended Run All.
metric = load_metric("seqeval", trust_remote_code=True)

def compute_metrics(p):
    """Convert raw evaluation outputs into overall precision / recall / F1 / accuracy.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` is reduced with
            ``argmax`` over axis 2 to get one label id per position; ``labels``
            holds the reference label ids, with ``-100`` at positions to skip.

    Returns:
        dict with keys ``precision``, ``recall``, ``f1`` and ``accuracy``, taken
        from the ``overall_*`` entries returned by ``metric.compute``.
    """
    raw_scores, gold_ids = p
    predicted_ids = np.argmax(raw_scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions whose reference label is not the ignore index.
        kept = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Training: wire the model, arguments, data and metric function into a Trainer,
# then run fine-tuning
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

The repository for seqeval contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/seqeval.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4867,0.371076,0.474035,0.54939,0.508938,0.899852
2,0.2066,0.175563,0.719632,0.770109,0.744015,0.951944
3,0.1712,0.149678,0.762669,0.808144,0.784748,0.959792


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=165, training_loss=0.44216727271224515, metrics={'train_runtime': 118.1434, 'train_samples_per_second': 356.541, 'train_steps_per_second': 1.397, 'total_flos': 723360821206086.0, 'train_loss': 0.44216727271224515, 'epoch': 3.0})

In [None]:
# Evaluate on the trainer's eval_dataset (the validation split)
trainer.evaluate()

{'eval_loss': 0.14967802166938782,
 'eval_precision': 0.762668918918919,
 'eval_recall': 0.8081440877055599,
 'eval_f1': 0.7847482483298028,
 'eval_accuracy': 0.9597915706864505,
 'eval_runtime': 5.2767,
 'eval_samples_per_second': 615.913,
 'eval_steps_per_second': 2.464,
 'epoch': 3.0}

In [None]:
# Per-entity-type evaluation on the validation split
predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
predictions = np.argmax(predictions, axis=2)

# Pair each predicted id with its reference id and drop the positions whose
# reference is the ignore index (-100), then map ids to label names.
kept_pairs = [
    [(p, l) for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_predictions = [[label_list[p] for (p, _) in pairs] for pairs in kept_pairs]
true_labels = [[label_list[l] for (_, l) in pairs] for pairs in kept_pairs]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'LOC': {'precision': 0.7294041028980788,
  'recall': 0.8556149732620321,
  'f1': 0.7874846194410265,
  'number': 2618},
 'MISC': {'precision': 0.6946902654867256,
  'recall': 0.5101543460601138,
  'f1': 0.5882903981264638,
  'number': 1231},
 'ORG': {'precision': 0.6179729175215429,
  'recall': 0.7324902723735408,
  'f1': 0.670376140663254,
  'number': 2056},
 'PER': {'precision': 0.9313725490196079,
  'recall': 0.9393539881344759,
  'f1': 0.9353462422054479,
  'number': 3034},
 'overall_precision': 0.762668918918919,
 'overall_recall': 0.8081440877055599,
 'overall_f1': 0.7847482483298028,
 'overall_accuracy': 0.9597915706864505}

第五步：模型保存与加载

In [None]:
# Save the fine-tuned model to ./my_model/
trainer.save_model('./my_model/')

In [None]:
# Reload the fine-tuned model from the directory written above
model = AutoModelForTokenClassification.from_pretrained("./my_model/")

第六步：模型推理

In [None]:
from transformers import pipeline

# Load the fine-tuned model and its tokenizer. The tokenizer name comes from the
# configuration cell so it always matches the checkpoint the model was trained from.
ner_pipeline = pipeline("ner", model="./my_model/", tokenizer=model_checkpoint)

# Helper that runs entity prediction on a piece of text
def predict_entities_with_pipeline(text):
    """Run ``ner_pipeline`` on ``text`` and return each token with a readable tag.

    Args:
        text: Input string passed to ``ner_pipeline``.

    Returns:
        A ``(text, entities)`` tuple where ``entities`` is a list of
        ``(word, label_name)`` pairs, one per pipeline result.
    """
    ner_results = ner_pipeline(text)

    def _label_name(raw_entity):
        # Generic names ending in "_<n>" (e.g. "LABEL_5") are looked up in
        # label_list; anything else is assumed to already be a readable tag
        # and is passed through instead of crashing in int().
        suffix = raw_entity.split('_')[-1]
        if suffix.isdigit():
            return label_list[int(suffix)]
        return raw_entity

    # Extract each token and its entity type
    entities = [(result['word'], _label_name(result['entity'])) for result in ner_results]

    return text, entities

# Try the helper on a sample sentence and print the input text followed by the
# (word, label) pairs
text = "Hugging Face is in new york!"
output_text, entities = predict_entities_with_pipeline(text)
print("输入文本:", output_text)
print("实体及其类型:", entities)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


输入文本: Hugging Face is in new york!
实体及其类型: [('hugging', 'O'), ('face', 'O'), ('is', 'O'), ('in', 'O'), ('new', 'B-LOC'), ('york', 'B-LOC'), ('!', 'O')]
