In [1]:
!pip install transformers datasets tensorflow accelerate sentencepiece



In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import load_dataset

insurance_corpus = load_dataset("Ddream-ai/InsuranceCorpus")

In [4]:
insurance_corpus["train"][0]

{'咨询': '投保中*人寿意外伤害险，被人打伤了，能不能得到理赔',
 '回复': '如果可以提供报警证明，证明自己是被人打了，那么可以凭报警证明，身份证，相关治疗的发票当保险公司进行理赔。针对非殴斗而是意外被袭,符合理赔条件的话，保险公司赔偿方式一般为：自意外伤害事故发生之日起一百八十日以内（含第一百八十日）所支出的合理医疗费用，在扣除一百元以后按百分之九十给付意外医疗保险金。特别注意的是意外伤害是指外来的、突发的、非本意的、非疾病的使身体受到伤害的客观事件。如果是寻衅殴斗中所受的意外伤害，这属于除外责任，保险人不应该承保的。'}

In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilroberta-base")
zh_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
trans_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
text ='央视春晚，没有最烂，只有更烂'
tokenized_text = zh_tokenizer.prepare_seq2seq_batch([text], return_tensors='pt')
translation = trans_model.generate(**tokenized_text)
translated_text = zh_tokenizer.batch_decode(translation, skip_special_tokens=False)[0]

print(tokenized_text)
print(translated_text)

  return self.fget.__get__(instance, owner)()


{'input_ids': tensor([[    7, 61548,  4654, 20342,  5576,     2,   311,   971, 15660,     2,
          2042,   615, 15660,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
<pad> It's not the worst, it's the worst.</s>


`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



In [6]:
def translate(text):
    tokenized_text = tokenizer.prepare_seq2seq_batch([text], return_tensors='pt')
    translation = model.generate(**tokenized_text)
    translated_text = tokenizer.batch_decode(translation, skip_special_tokens=False)[0]
    print(translated_text)
    return translated_text

def preprocess_function(examples):
    flatten_list = [item for sublist in zip(examples['咨询'], examples['回复']) for item in sublist]
    return tokenizer(flatten_list)


In [7]:
tokenized_insurance_corpus = insurance_corpus.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=insurance_corpus["train"].column_names,
)

In [8]:
block_size = 512


def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [9]:
lm_dataset = tokenized_insurance_corpus.map(group_texts, batched=True, num_proc=4)

In [10]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

2024-04-26 05:56:31.351903: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-26 05:56:31.371555: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Need to manually upload the `tokenizer.json` from "distilroberta-base" to my own hub

In [12]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="my_insurance_mlm_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
    push_to_hub_model_id="my_insurance_mlm_model",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["validation"],
    data_collator=data_collator,
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,1.3211,0.985923
2,1.1117,0.888674
3,1.0186,0.856245


TrainOutput(global_step=1932, training_loss=1.1134935303998044, metrics={'train_runtime': 321.7665, 'train_samples_per_second': 47.979, 'train_steps_per_second': 6.004, 'total_flos': 2047415450167296.0, 'train_loss': 1.1134935303998044, 'epoch': 3.0})

In [13]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 2.32


In [14]:
text = "My husband likes to drink a lot and I am worried about his health. What kind of critical illness <mask> can I buy?"

In [15]:
from transformers import pipeline

mask_filler = pipeline("fill-mask", "michaelfong2017/my_insurance_mlm_model")
mask_filler(text, top_k=3)

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

[{'score': 0.18883416056632996,
  'token': 2196,
  'token_str': ' drugs',
  'sequence': 'My husband likes to drink a lot and I am worried about his health. What kind of critical illness drugs can I buy?'},
 {'score': 0.18210262060165405,
  'token': 8456,
  'token_str': ' medication',
  'sequence': 'My husband likes to drink a lot and I am worried about his health. What kind of critical illness medication can I buy?'},
 {'score': 0.10659895092248917,
  'token': 6150,
  'token_str': ' medicine',
  'sequence': 'My husband likes to drink a lot and I am worried about his health. What kind of critical illness medicine can I buy?'}]