基于GPT-2预训练模型的prompt learning：通过人工定义prompt template与verbalizer，进行句子情感分类

In [1]:
import mindspore.ops as ops

from mindprompt.plms import load_plm
from mindprompt import PromptDataLoader
from mindprompt import PromptForClassification
from mindprompt.prompts import ManualTemplate
from mindprompt.prompts import ManualVerbalizer
from mindprompt.data_utils import InputExample

  from tqdm.autonotebook import tqdm


In [2]:
# Step 1: define the sentiment-analysis task.
classes = ["negative", "positive"]  # label names, in class-index order

# Inputs [x] (e.g. "i love this movie"): one InputExample per sentence, with
# guid set to the sentence's position in the list; no label is passed.
dataset = [
    InputExample(guid=position, text_a=sentence)
    for position, sentence in enumerate(
        [
            "Albert Einstein was one of the greatest intellects of his time.",
            "The film was badly made.",
            "Kevin is a fantastic boy.",
            "kitty is a bad boy.",
        ]
    )
]

In [3]:
# Show the first example and then the dataset size, each on its own line.
print(dataset[0], len(dataset), sep="\n")


{
  "guid": 0,
  "label": null,
  "meta": {},
  "text_a": "Albert Einstein was one of the greatest intellects of his time.",
  "text_b": "",
  "tgt_text": null
}

4


In [4]:
# Step 2: load a pre-trained language model (PLM) to serve as the backbone.
# load_plm returns four objects, unpacked here as the model, its tokenizer,
# the model config, and a tokenizer-wrapper class (instantiated later as
# WrapperClass(max_seq_length=..., tokenizer=...)).
# NOTE(review): the two "gpt2" arguments are presumably the model family and
# the checkpoint name/path -- confirm against mindprompt.plms.load_plm.
plm, tokenizer, model_config, WrapperClass = load_plm("gpt2", "gpt2")

100%|██████████| 523M/523M [00:11<00:00, 46.0MB/s] 
[ERROR] ME(3116130:139764658304832,MainProcess):2023-06-07-21:05:47.810.405 [/home/daiyuxin/anaconda3/envs/xyr/lib/python3.7/site-packages/mindnlp/abc/mixins/special_tokens_mixin.py:304] Using pad_token, but it is not set yet.


In [5]:
# Step 3: define the prompt template.
# The template text puts the example's text_a first, followed by the fixed
# words " It was" and a mask slot. The recorded wrap demo shows these as three
# separate pieces, with only the mask piece flagged by loss_ids = 1.
promptTemplate = ManualTemplate(
    text='{"placeholder":"text_a"} It was {"mask"}',
    tokenizer=tokenizer,
)

In [6]:
# Step 4: define the verbalizer, i.e. the mapping between classes and the
# label words that stand for them. "negative" has one label word and
# "positive" has three.
# NOTE(review): how the scores of several label words are combined into one
# class score is decided inside ManualVerbalizer and is not visible here.
promptVerbalizer = ManualVerbalizer(
    classes=classes,
    label_words={
        "negative": ["bad"],
        "positive": ["good", "wonderful", "great"],
    },
    tokenizer=tokenizer,
)

In [16]:
# Supplementary demo: what the template's wrap step produces for one example.
print(f'input example: \n {dataset[0]}')

wrapped_example = promptTemplate.wrap_one_example(dataset[0])

# Element 0 of the wrapped result is iterated piece by piece; in the recorded
# output each piece is a dict with 'text', 'loss_ids' and 'shortenable_ids'.
print('wrapped example:')
for ele in wrapped_example[0]:
    print(ele)

input example: 
 {
  "guid": 0,
  "label": null,
  "meta": {},
  "text_a": "Albert Einstein was one of the greatest intellects of his time.",
  "text_b": "",
  "tgt_text": null
}

wrapped example:
{'text': 'Albert Einstein was one of the greatest intellects of his time.', 'loss_ids': 0, 'shortenable_ids': 1}
{'text': ' It was', 'loss_ids': 0, 'shortenable_ids': 0}
{'text': '<mask>', 'loss_ids': 1, 'shortenable_ids': 0}


In [17]:
# Supplementary demo: tokenizing a wrapped example.
# max_seq_length=32: the recorded output of the following cell shows
# input_ids, loss_ids and attention_mask each filled out to 32 entries.
wrap_tokenizer = WrapperClass(max_seq_length=32, tokenizer=tokenizer)
# NOTE(review): teacher_forcing=False presumably keeps any target text out of
# the model input, as is usual for inference -- confirm in the wrapper class.
tokenized_example = wrap_tokenizer.tokenize_one_example(wrapped_example, teacher_forcing=False)

[ERROR] ME(3116130:139764658304832,MainProcess):2023-06-07-21:17:53.776.034 [/home/daiyuxin/anaconda3/envs/xyr/lib/python3.7/site-packages/mindnlp/abc/mixins/special_tokens_mixin.py:328] Using mask_token, but it is not set yet.
[ERROR] ME(3116130:139764658304832,MainProcess):2023-06-07-21:17:53.777.046 [/home/daiyuxin/anaconda3/envs/xyr/lib/python3.7/site-packages/mindnlp/abc/mixins/special_tokens_mixin.py:328] Using mask_token, but it is not set yet.
[ERROR] ME(3116130:139764658304832,MainProcess):2023-06-07-21:17:53.777.392 [/home/daiyuxin/anaconda3/envs/xyr/lib/python3.7/site-packages/mindnlp/abc/mixins/special_tokens_mixin.py:293] Using sep_token, but it is not set yet.
[ERROR] ME(3116130:139764658304832,MainProcess):2023-06-07-21:17:53.777.741 [/home/daiyuxin/anaconda3/envs/xyr/lib/python3.7/site-packages/mindnlp/abc/mixins/special_tokens_mixin.py:293] Using sep_token, but it is not set yet.
[ERROR] ME(3116130:139764658304832,MainProcess):2023-06-07-21:17:53.778.060 [/home/daiyuxi

In [26]:
print(f'tokenized example: \n {tokenized_example}\n')

# Dump every field of the tokenized example on its own line.
print('printing each key-value pair:')
for key, value in tokenized_example.items():
    print(f'{key}:{value}')

# Map input_ids back to token strings.
# A comprehension is used instead of a `for id in ...` loop so that the builtin
# `id` is not rebound at notebook (global) scope for every later cell.
# Note: every entry of input_ids is converted, including the trailing ids that
# the attention_mask marks with 0 in the recorded output.
print('\ninput_ids to tokens:')
tokens = [tokenizer.id_to_token(token_id) for token_id in tokenized_example['input_ids']]

print(' '.join(tokens))

tokenized example: 
 {'input_ids': [42590, 24572, 373, 530, 286, 262, 6000, 7654, 82, 286, 465, 640, 13, 632, 373, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257], 'loss_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'input_ids_len': 15}

printing each key-value pair:
input_ids:[42590, 24572, 373, 530, 286, 262, 6000, 7654, 82, 286, 465, 640, 13, 632, 373, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257]
loss_ids:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
attention_mask:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
input_ids_len:15

input_ids to tokens:
Albert ĠEinstein Ġwas

In [27]:
# Step 5: assemble the prompt model from the template, the PLM backbone and
# the verbalizer. The resulting object is later called on a batch and its
# result is treated as logits (an argmax over the last axis picks the class).
promptModel = PromptForClassification(
    template=promptTemplate,
    plm=plm,
    verbalizer=promptVerbalizer,
)

In [28]:
# Step 6: build the data loader from the dataset, the tokenizer, the template
# and the tokenizer-wrapper class.
# NOTE(review): no batch size or maximum sequence length is passed, so the
# PromptDataLoader defaults apply -- confirm they suit the task.
data_loader = PromptDataLoader(
    dataset=dataset,
    tokenizer=tokenizer,
    template=promptTemplate,
    tokenizer_wrapper_class=WrapperClass,
)

[ERROR] ME(3116130:139764658304832,MainProcess):2023-06-07-21:20:34.705.517 [/home/daiyuxin/anaconda3/envs/xyr/lib/python3.7/site-packages/mindnlp/abc/mixins/special_tokens_mixin.py:328] Using mask_token, but it is not set yet.
[ERROR] ME(3116130:139764658304832,MainProcess):2023-06-07-21:20:34.706.888 [/home/daiyuxin/anaconda3/envs/xyr/lib/python3.7/site-packages/mindnlp/abc/mixins/special_tokens_mixin.py:328] Using mask_token, but it is not set yet.
[ERROR] ME(3116130:139764658304832,MainProcess):2023-06-07-21:20:34.707.348 [/home/daiyuxin/anaconda3/envs/xyr/lib/python3.7/site-packages/mindnlp/abc/mixins/special_tokens_mixin.py:293] Using sep_token, but it is not set yet.
[ERROR] ME(3116130:139764658304832,MainProcess):2023-06-07-21:20:34.707.760 [/home/daiyuxin/anaconda3/envs/xyr/lib/python3.7/site-packages/mindnlp/abc/mixins/special_tokens_mixin.py:293] Using sep_token, but it is not set yet.
[ERROR] ME(3116130:139764658304832,MainProcess):2023-06-07-21:20:34.708.122 [/home/daiyuxi

In [29]:
# Step 7: inference. No training step is run in this notebook; the loaded
# PLM is used as-is.
promptModel.set_train(False)  # turn training mode off before predicting
for batch in data_loader.dataloader.create_tuple_iterator():
    logits = promptModel(batch)
    preds = ops.argmax(logits, dim=-1)
    # Decode every prediction in the batch. Indexing the Python list with the
    # tensor itself (classes[preds]) only works when the batch holds exactly
    # one example, so convert to NumPy and look up each class index in turn.
    for class_idx in preds.asnumpy().reshape(-1):
        print(classes[int(class_idx)])

positive
positive
positive
positive
