第一步：安装和导入必要包

In [None]:
!pip install datasets==1.18.3 transformers seqeval

In [2]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import load_dataset, load_metric, ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML
import numpy as np

第二步：原始数据加载

In [3]:
task = "ner" # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

In [4]:
# 加载数据
datasets = load_dataset("conll2003")

Downloading:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 (download: 959.94 KiB, generated: 9.78 MiB, post-processed: Unknown size, total: 10.72 MiB) to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee...


Downloading:   0%|          | 0.00/983k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# 字典
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14042
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3251
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3454
    })
})

In [6]:
# 样例
datasets["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [7]:
# 标签
label_list = datasets["train"].features[f"ner_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [8]:
# 直观展示原始数据
def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

show_random_elements(datasets["train"])

Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags
0,3825,"[standings, after, games, played, on, Friday, (, tabulate, under, won, ,]","[NNS, IN, NNS, VBN, IN, NNP, (, NN, IN, JJ, ,]","[B-NP, B-SBAR, B-NP, B-VP, B-PP, B-NP, O, B-NP, B-PP, B-NP, I-NP]","[O, O, O, O, O, O, O, O, O, O, O]"
1,3378,"[Results, from, the, $, 450,000, Toshiba, Classic, tennis, tournament, on, Saturday, (, prefix, number, denotes, seeding, ), :]","[NNS, IN, DT, $, CD, NNP, NNP, NN, NN, IN, NNP, (, JJ, NN, VBZ, NN, ), :]","[B-NP, B-PP, B-NP, I-NP, I-NP, I-NP, I-NP, I-NP, I-NP, B-PP, B-NP, O, B-NP, I-NP, B-VP, B-NP, O, O]","[O, O, O, O, O, B-MISC, I-MISC, O, O, O, O, O, O, O, O, O, O, O]"
2,7799,"[ISSUE, :, Public, School, ,, Series, 1996, TAX, STAT:Exempt-ULT]","[NNP, :, NNP, NNP, ,, NNP, CD, NN, NN]","[B-NP, O, B-NP, I-NP, O, B-NP, I-NP, I-NP, I-NP]","[O, O, O, O, O, O, O, O, O]"
3,3644,"[Elsewhere, ,, title, hopefuls, Liverpool, were, held, 0-0, at, home, by, newly-promoted, Sunderland, ,, and, in, London, ,, the, tie, between, Tottenham, Hotspur, and, Everton, also, ended, goaless, .]","[RB, ,, NN, NNS, NNP, VBD, VBN, JJ, IN, NN, IN, JJ, NN, ,, CC, IN, NNP, ,, DT, NN, IN, NNP, NNP, CC, NNP, RB, VBD, NNS, .]","[B-ADVP, O, B-NP, I-NP, I-NP, B-VP, I-VP, B-ADJP, B-PP, B-NP, B-PP, B-NP, I-NP, O, O, B-PP, B-NP, O, B-NP, I-NP, B-PP, B-NP, I-NP, I-NP, I-NP, B-ADVP, B-VP, B-NP, O]","[O, O, O, O, B-ORG, O, O, O, O, O, O, O, B-ORG, O, O, O, B-LOC, O, O, O, O, B-ORG, I-ORG, O, B-ORG, O, O, O, O]"
4,7980,"[-, English, langage, to, be, taught, as, of, the, eighth, year, of, the, primary, school, instead, of, the, third, year, of, the, secondary, school, .]","[:, JJ, NN, TO, VB, VBN, IN, IN, DT, JJ, NN, IN, DT, JJ, NN, RB, IN, DT, JJ, NN, IN, DT, JJ, NN, .]","[O, B-NP, I-NP, B-VP, I-VP, I-VP, B-PP, B-PP, B-NP, I-NP, I-NP, B-PP, B-NP, I-NP, I-NP, B-CONJP, I-CONJP, B-NP, I-NP, I-NP, B-PP, B-NP, I-NP, I-NP, O]","[O, B-MISC, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
5,4559,"[MOTOCROSS, -, SWEDISH, 500CC, GRAND, PRIX, RESULTS, .]","[NNP, :, $, JJ, NNP, NNP, NNS, .]","[B-NP, O, B-NP, I-NP, I-NP, I-NP, I-NP, O]","[O, O, B-MISC, O, B-MISC, I-MISC, O, O]"
6,13991,"[The, 32-year-old, defender, played, seven, seasons, with, Nantes, and, was, with, Paris, St, Germain, for, five, seasons, .]","[DT, JJ, NN, VBD, CD, NNS, IN, NNPS, CC, VBD, IN, NNP, NNP, NNP, IN, CD, NNS, .]","[B-NP, I-NP, I-NP, B-VP, B-NP, I-NP, B-PP, B-NP, I-NP, B-VP, B-PP, B-NP, I-NP, I-NP, B-PP, B-NP, I-NP, O]","[O, O, O, O, O, O, O, B-ORG, O, O, O, B-ORG, I-ORG, I-ORG, O, O, O, O]"
7,3881,"[BASEBALL, -, MAJOR, LEAGUE, RESULTS, FRIDAY, .]","[NNP, :, NNP, NNP, NNP, NNP, .]","[B-NP, O, B-NP, I-NP, I-NP, I-NP, O]","[O, O, B-MISC, I-MISC, O, O, O]"
8,5970,"[Commanders, "", Vicente, "", and, "", Oscar, "", ,, guarded, by, a, dozen, EPR, gunmen, ,, said, in, an, interview, with, La, Jornada, outside, Mexico, City, that, the, armed, group, was, committed, to, overthrowing, the, government, .]","[NNPS, "", NNP, "", CC, "", NNP, "", ,, VBN, IN, DT, NN, NNP, NNS, ,, VBD, IN, DT, NN, IN, NNP, NNP, IN, NNP, NNP, IN, DT, JJ, NN, VBD, VBN, TO, VBG, DT, NN, .]","[B-NP, O, B-NP, O, O, O, B-NP, O, O, B-VP, B-PP, B-NP, I-NP, I-NP, I-NP, O, B-VP, B-PP, B-NP, I-NP, B-PP, B-NP, I-NP, B-PP, B-NP, I-NP, B-SBAR, B-NP, I-NP, I-NP, B-VP, I-VP, B-PP, B-VP, B-NP, I-NP, O]","[O, O, B-PER, O, O, O, B-PER, O, O, O, O, O, O, B-ORG, O, O, O, O, O, O, O, B-ORG, I-ORG, O, B-LOC, I-LOC, O, O, O, O, O, O, O, O, O, O, O]"
9,11104,"[Aid, agency, says, Sudan, missionaries, released, .]","[NN, NN, VBZ, NNP, NNS, VBN, .]","[B-NP, I-NP, B-VP, B-NP, I-NP, B-VP, O]","[O, O, O, B-LOC, O, O, O]"


In [9]:
label_list = datasets["train"].features[f"{task}_tags"].feature.names

第三步：数据处理

In [10]:
# 加载tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
# 分词和labels对齐函数, 英文的单词在分词后最小单位为子词，这意味着输入的总token数相对于原始token数变长了，
# 这时候需要将原始的ner标签在新的tokens集上进行重新打标，重打标的过程也很简单，只需要将原来的标签的相应子词上进行拷贝即可。同时在开始符和结束符位置置成-100
label_all_tokens = True
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenize_and_align_labels(datasets['train'][0:1])

{'input_ids': [[101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]]}

In [12]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)



  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

第四步：构建模型

In [13]:
# 模型加载
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

# 模型超参
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    eval_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    report_to='none'
    # push_to_hub=True,
)

# 数据批处理器
data_collator = DataCollatorForTokenClassification(tokenizer)

# 评估部分
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# 训练部分
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4238,0.33976,0.523639,0.589775,0.554743,0.914134
2,0.2048,0.175622,0.709385,0.761047,0.734308,0.950879
3,0.1671,0.152089,0.748686,0.796845,0.772015,0.957488


TrainOutput(global_step=165, training_loss=0.4051614143631675, metrics={'train_runtime': 63.5495, 'train_samples_per_second': 662.885, 'train_steps_per_second': 2.596, 'total_flos': 731504449007892.0, 'train_loss': 0.4051614143631675, 'epoch': 3.0})

In [14]:
trainer.evaluate()

{'eval_loss': 0.15208852291107178,
 'eval_precision': 0.7486861467311331,
 'eval_recall': 0.7968452847074616,
 'eval_f1': 0.7720153904514172,
 'eval_accuracy': 0.957488045498594,
 'eval_runtime': 2.9053,
 'eval_samples_per_second': 1119.007,
 'eval_steps_per_second': 4.475,
 'epoch': 3.0}

In [15]:
# 各类别评估
predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'LOC': {'precision': 0.7136493795736557,
  'recall': 0.8567608861726509,
  'f1': 0.7786842562055198,
  'number': 2618},
 'MISC': {'precision': 0.5966850828729282,
  'recall': 0.4386677497969131,
  'f1': 0.5056179775280899,
  'number': 1231},
 'ORG': {'precision': 0.6097055163832434,
  'recall': 0.7149805447470817,
  'f1': 0.6581598388179987,
  'number': 2056},
 'PER': {'precision': 0.939443535188216,
  'recall': 0.9459459459459459,
  'f1': 0.9426835276728528,
  'number': 3034},
 'overall_precision': 0.7486861467311331,
 'overall_recall': 0.7968452847074616,
 'overall_f1': 0.7720153904514172,
 'overall_accuracy': 0.957488045498594}

第五步：模型保存与加载

In [16]:
trainer.save_model('./my_model/')

In [17]:
model = AutoModelForTokenClassification.from_pretrained("./my_model/")

第六步：模型推理

In [18]:
from transformers import pipeline

# 加载训练好的模型和分词器
ner_pipeline = pipeline("ner", model="./my_model/", tokenizer="distilbert-base-uncased")

# 定义函数来预测实体
def predict_entities_with_pipeline(text):
    # 使用pipeline进行实体识别
    ner_results = ner_pipeline(text)

    # 提取实体及其类型
    entities = [(result['word'], label_list[int(result['entity'].split('_')[-1])]) for result in ner_results]

    return text, entities

# 测试函数
text = "Hugging Face is in new york!"
output_text, entities = predict_entities_with_pipeline(text)
print("输入文本:", output_text)
print("实体及其类型:", entities)


Device set to use cuda:0


输入文本: Hugging Face is in new york!
实体及其类型: [('hugging', 'O'), ('face', 'O'), ('is', 'O'), ('in', 'O'), ('new', 'B-LOC'), ('york', 'B-LOC'), ('!', 'O')]
