安装依赖

In [1]:
# !pip install mindspore==2.3.1 mindnlp==0.4.1
!pip install datasets emoji scikit-learn

Looking in indexes: http://mirrors.aliyun.com/pypi/simple/
Collecting emoji
  Downloading http://mirrors.aliyun.com/pypi/packages/91/db/a0335710caaa6d0aebdaa65ad4df789c15d89b7babd9a30277838a7d9aac/emoji-2.14.1-py3-none-any.whl (590 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[33mDEPRECATION: moxing-framework 2.1.16.2ae09d45 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of moxing-framework or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mInstalling collected packages: emoji
Successfully installed emoji-2.14.1
[0m

In [2]:
import mindspore
# Target Ascend device 0.
# NOTE(review): `pynative_synchronize=True` is presumably kept for easier debugging — confirm it is still needed.
context_options = {
    'device_target': 'Ascend',
    'device_id': 0,
    'pynative_synchronize': True,
}
mindspore.set_context(**context_options)

加载数据集

In [None]:
from datasets import load_dataset

def _has_tweet_and_binary_label(example):
    """Return True for rows with a non-null tweet and a label of 0 or 1."""
    return example['tweet'] is not None and example['label'] in [0, 1]


# Load the dataset from the local directory; the commented line is the alternative identifier.
dataset = load_dataset("/tmp/code/hate_speech_twitter")
# dataset = load_dataset("thefrankhsu/hate_speech_twitter")

# Drop the 'categories' column and keep only rows accepted by the predicate above.
train_split = dataset['train'].remove_columns('categories')
test_split = dataset['test'].remove_columns('categories')
train_dataset = train_split.filter(_has_tweet_and_binary_label)
test_dataset = test_split.filter(_has_tweet_and_binary_label)

print("train dataset num_rows: ", train_dataset.num_rows)
print("test dataset num_rows: ", test_dataset.num_rows)
# Preview the first five training rows as a pandas frame.
print(train_dataset.with_format("pandas")[:5])

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 5679 examples [00:00, 145319.43 examples/s]
Generating test split: 1000 examples [00:00, 72245.83 examples/s]
Filter: 100%|██████████| 5679/5679 [00:00<00:00, 122458.75 examples/s]
Filter: 100%|██████████| 1000/1000 [00:00<00:00, 68059.52 examples/s]

train dataset num_rows:  5678
test dataset num_rows:  1000
                                               tweet  label
0  krazy i dont always get drunk and pass out but...      0
1  white kids favorite activities calling people ...      1
2  maam did you clear that tweet with the   caref...      0
3  wth is that playing missy  i mean seriously rt...      0
4           he promised to stand with the muzzies so      0





加载模型

In [4]:
from mindnlp.transformers import BertweetTokenizer, AutoModelForSequenceClassification

# Local checkpoint directory shared by the tokenizer and the model;
# the commented lines below show the alternative identifier.
pretrained_path = "/tmp/code/bertweet-base"

tokenizer = BertweetTokenizer.from_pretrained(pretrained_path)
# tokenizer = BertweetTokenizer.from_pretrained("vinai/bertweet-base")

# Two output classes, matching the 0/1 labels kept when filtering the dataset.
model = AutoModelForSequenceClassification.from_pretrained(pretrained_path, num_labels=2)
# model = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=2)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.412 seconds.
Prefix dict has been built successfully.


[MS_ALLOC_CONF]Runtime config:  enable_vmm:True  vmm_align_size:2MB


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /tmp/code/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


数据预处理

In [None]:
import mindspore
from mindspore.dataset import GeneratorDataset, transforms

class HSTDataset:
    """Random-access wrapper that exposes dataset rows as ``(tweet, label)`` pairs."""

    def __init__(self, dataset):
        # Indexable collection whose rows provide 'tweet' and 'label' entries.
        self.dataset = dataset

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the ``(tweet, label)`` pair stored at position ``idx``."""
        # Indices that are not built-in ints (presumably NumPy integers from
        # the data loader) are coerced before indexing.
        position = idx if isinstance(idx, int) else int(idx)
        row = self.dataset[position]
        return row['tweet'], row['label']

def process_dataset(source, tokenizer, max_seq_len=64, batch_size=32, shuffle=True, drop_remainder=False):
    """Tokenize and batch a ``(tweet, label)`` source into a MindSpore dataset.

    Args:
        source: Random-access source yielding ``(tweet, label)`` pairs.
        tokenizer: Callable returning a mapping with ``input_ids`` and
            ``attention_mask``; must expose ``pad_token_id``.
        max_seq_len: Maximum number of tokens kept per tweet. Longer inputs
            are truncated on every backend.
        batch_size: Number of samples per batch.
        shuffle: Whether the generator dataset shuffles its samples.
        drop_remainder: Whether to drop a final batch smaller than ``batch_size``.

    Returns:
        The batched dataset with columns ``input_ids``, ``attention_mask``
        and ``labels``.
    """
    is_ascend = mindspore.get_context('device_target') == 'Ascend'

    column_names = ["tweet", "label"]

    dataset = GeneratorDataset(source, column_names=column_names, shuffle=shuffle)
    # Labels are cast to int32 before batching.
    type_cast_op = transforms.TypeCast(mindspore.int32)

    def tokenize_and_pad(text):
        """Tokenize one tweet and return ``(input_ids, attention_mask)``."""
        if is_ascend:
            # Pad every sample to max_seq_len so all batches share one shape.
            tokenized = tokenizer(text, padding='max_length', truncation=True, max_length=max_seq_len)
        else:
            # Padding is deferred to padded_batch below, but truncation is still
            # applied so max_seq_len is honoured on non-Ascend backends as well.
            tokenized = tokenizer(text, truncation=True, max_length=max_seq_len)
        return tokenized['input_ids'], tokenized['attention_mask']

    # Replace the raw text column with token ids and attention mask.
    dataset = dataset.map(operations=tokenize_and_pad, input_columns="tweet", output_columns=['input_ids', 'attention_mask'])
    # Rename 'label' to 'labels' while casting it.
    dataset = dataset.map(operations=[type_cast_op], input_columns="label", output_columns='labels')

    if is_ascend:
        # Samples already have equal length, so plain batching is enough.
        dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
    else:
        # Pad each batch to its longest sample.
        dataset = dataset.padded_batch(batch_size, drop_remainder=drop_remainder,
                                       pad_info={'input_ids': (None, tokenizer.pad_token_id),
                                                 'attention_mask': (None, 0)})

    return dataset

# NOTE: `train_dataset` / `test_dataset` are rebound from the raw splits to the
# batched pipelines here, so re-running this cell on its own would wrap the
# already-processed datasets.
train_source = HSTDataset(train_dataset)
test_source = HSTDataset(test_dataset)

# Training batches are shuffled (default) and incomplete batches are dropped.
train_dataset = process_dataset(train_source, tokenizer, drop_remainder=True)
# Evaluation keeps the original order and every sample.
test_dataset = process_dataset(test_source, tokenizer, shuffle=False)

In [6]:
# Pull a single batch from the training pipeline to inspect its columns.
batch_iterator = train_dataset.create_dict_iterator()
first_batch = next(batch_iterator)
print(first_batch)

{'input_ids': Tensor(shape=[32, 64], dtype=Int64, value=
[[    0,    37,  1484 ...     1,     1,     1],
 [    0,   462, 10898 ...     1,     1,     1],
 [    0,   111,   112 ...     1,     1,     1],
 ...
 [    0,     6,    83 ...     1,     1,     1],
 [    0,   322,  1472 ...     1,     1,     1],
 [    0,   460, 51959 ...     1,     1,     1]]), 'attention_mask': Tensor(shape=[32, 64], dtype=Int64, value=
[[1, 1, 1 ... 0, 0, 0],
 [1, 1, 1 ... 0, 0, 0],
 [1, 1, 1 ... 0, 0, 0],
 ...
 [1, 1, 1 ... 0, 0, 0],
 [1, 1, 1 ... 0, 0, 0],
 [1, 1, 1 ... 0, 0, 0]]), 'labels': Tensor(shape=[32], dtype=Int32, value= [1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 
 0, 0, 0, 1, 0, 0, 0, 0])}


评估函数

In [7]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

def compute_metrics(eval_pred):
    """Compute binary classification metrics from raw model outputs.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` along the last axis.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 yields the same 0.0 score as the default when a metric
    # is undefined (e.g. no positive predictions), but without emitting an
    # UndefinedMetricWarning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary', zero_division=0
    )

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

设置训练参数

In [8]:
from mindnlp.engine import TrainingArguments

training_args = TrainingArguments(
    output_dir="./bertweet_finetune/trainer_output",
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate, checkpoint and log once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=2,
    # Model selection: reload the best checkpoint by accuracy (higher is better).
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
)

加载Trainer

In [9]:
from mindnlp.engine import Trainer

# Everything the Trainer needs, collected in one place.
# NOTE(review): the evaluation set is the processed test split, which is also
# used to select the best checkpoint — confirm this is intended.
trainer_options = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': test_dataset,
    'tokenizer': tokenizer,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_options)

训练

In [10]:
# Run fine-tuning; the returned value is kept in a variable and not displayed.
train_result = trainer.train()

print("Train over!")

  0%|          | 0/885 [00:00<?, ?it/s]

|

 20%|██        | 177/885 [03:14<10:29,  1.12it/s]

/

 20%|██        | 177/885 [03:15<10:29,  1.12it/s]

{'loss': 0.3052, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.0}



  0%|          | 0/32 [00:00<?, ?it/s][A
  6%|▋         | 2/32 [00:00<00:08,  3.59it/s][A
  9%|▉         | 3/32 [00:00<00:06,  4.67it/s][A
 12%|█▎        | 4/32 [00:00<00:05,  5.56it/s][A
 16%|█▌        | 5/32 [00:00<00:04,  6.25it/s][A
 19%|█▉        | 6/32 [00:01<00:03,  6.79it/s][A
 22%|██▏       | 7/32 [00:01<00:03,  7.17it/s][A
 25%|██▌       | 8/32 [00:01<00:03,  7.45it/s][A
 28%|██▊       | 9/32 [00:01<00:03,  7.60it/s][A
 31%|███▏      | 10/32 [00:01<00:02,  7.71it/s][A
 34%|███▍      | 11/32 [00:01<00:02,  7.78it/s][A
 38%|███▊      | 12/32 [00:01<00:02,  7.84it/s][A
 41%|████      | 13/32 [00:01<00:02,  7.88it/s][A
 44%|████▍     | 14/32 [00:02<00:02,  7.89it/s][A
 47%|████▋     | 15/32 [00:02<00:02,  7.92it/s][A
 50%|█████     | 16/32 [00:02<00:02,  7.98it/s][A
 53%|█████▎    | 17/32 [00:02<00:01,  8.00it/s][A
 56%|█████▋    | 18/32 [00:02<00:01,  7.97it/s][A
 59%|█████▉    | 19/32 [00:02<00:01,  7.97it/s][A
 62%|██████▎   | 20/32 [00:02<00:01,  7.98it/s]

{'eval_loss': 0.8967171907424927, 'eval_accuracy': 0.67, 'eval_precision': 0.9427083333333334, 'eval_recall': 0.362, 'eval_f1': 0.523121387283237, 'eval_runtime': 6.2447, 'eval_samples_per_second': 5.124, 'eval_steps_per_second': 0.16, 'epoch': 1.0}


 40%|████      | 354/885 [06:26<07:55,  1.12it/s]  

{'loss': 0.143, 'learning_rate': 1.2e-05, 'epoch': 2.0}



  0%|          | 0/32 [00:00<?, ?it/s][A
  6%|▋         | 2/32 [00:00<00:05,  5.53it/s][A
  9%|▉         | 3/32 [00:00<00:07,  3.88it/s][A
 12%|█▎        | 4/32 [00:01<00:08,  3.50it/s][A
 16%|█▌        | 5/32 [00:01<00:07,  3.72it/s][A
 19%|█▉        | 6/32 [00:01<00:05,  4.46it/s][A
 22%|██▏       | 7/32 [00:01<00:04,  5.14it/s][A
 25%|██▌       | 8/32 [00:01<00:04,  5.71it/s][A
 28%|██▊       | 9/32 [00:01<00:03,  6.20it/s][A
 31%|███▏      | 10/32 [00:01<00:03,  6.59it/s][A
 34%|███▍      | 11/32 [00:02<00:03,  6.95it/s][A
 38%|███▊      | 12/32 [00:02<00:02,  6.94it/s][A
 41%|████      | 13/32 [00:02<00:02,  7.18it/s][A
 44%|████▍     | 14/32 [00:02<00:02,  7.42it/s][A
 47%|████▋     | 15/32 [00:02<00:02,  7.61it/s][A
 50%|█████     | 16/32 [00:02<00:02,  7.51it/s][A
 53%|█████▎    | 17/32 [00:02<00:02,  7.44it/s][A
 56%|█████▋    | 18/32 [00:03<00:01,  7.48it/s][A
 59%|█████▉    | 19/32 [00:03<00:01,  7.53it/s][A
 62%|██████▎   | 20/32 [00:03<00:01,  7.63it/s]

{'eval_loss': 0.8762021064758301, 'eval_accuracy': 0.738, 'eval_precision': 0.9407407407407408, 'eval_recall': 0.508, 'eval_f1': 0.6597402597402598, 'eval_runtime': 5.5101, 'eval_samples_per_second': 5.808, 'eval_steps_per_second': 0.181, 'epoch': 2.0}


 60%|██████    | 531/885 [09:36<05:27,  1.08it/s]  

{'loss': 0.0963, 'learning_rate': 8.000000000000001e-06, 'epoch': 3.0}



  0%|          | 0/32 [00:00<?, ?it/s][A
  6%|▋         | 2/32 [00:00<00:05,  5.69it/s][A
  9%|▉         | 3/32 [00:00<00:07,  3.84it/s][A
 12%|█▎        | 4/32 [00:01<00:08,  3.43it/s][A
 16%|█▌        | 5/32 [00:01<00:08,  3.20it/s][A
 19%|█▉        | 6/32 [00:01<00:06,  3.95it/s][A
 22%|██▏       | 7/32 [00:01<00:05,  4.67it/s][A
 25%|██▌       | 8/32 [00:01<00:04,  5.33it/s][A
 28%|██▊       | 9/32 [00:01<00:03,  5.88it/s][A
 31%|███▏      | 10/32 [00:02<00:03,  6.34it/s][A
 34%|███▍      | 11/32 [00:02<00:03,  6.74it/s][A
 38%|███▊      | 12/32 [00:02<00:02,  7.06it/s][A
 41%|████      | 13/32 [00:02<00:02,  7.27it/s][A
 44%|████▍     | 14/32 [00:02<00:02,  7.43it/s][A
 47%|████▋     | 15/32 [00:02<00:02,  7.47it/s][A
 50%|█████     | 16/32 [00:02<00:02,  7.48it/s][A
 53%|█████▎    | 17/32 [00:02<00:01,  7.51it/s][A
 56%|█████▋    | 18/32 [00:03<00:01,  7.49it/s][A
 59%|█████▉    | 19/32 [00:03<00:01,  7.54it/s][A
 62%|██████▎   | 20/32 [00:03<00:01,  7.62it/s]

{'eval_loss': 0.6897300481796265, 'eval_accuracy': 0.79, 'eval_precision': 0.9475308641975309, 'eval_recall': 0.614, 'eval_f1': 0.7451456310679612, 'eval_runtime': 5.406, 'eval_samples_per_second': 5.919, 'eval_steps_per_second': 0.185, 'epoch': 3.0}


 80%|████████  | 708/885 [12:44<02:43,  1.08it/s]

{'loss': 0.0635, 'learning_rate': 4.000000000000001e-06, 'epoch': 4.0}



  0%|          | 0/32 [00:00<?, ?it/s][A
  6%|▋         | 2/32 [00:00<00:05,  5.44it/s][A
  9%|▉         | 3/32 [00:00<00:07,  3.92it/s][A
 12%|█▎        | 4/32 [00:01<00:08,  3.32it/s][A
 16%|█▌        | 5/32 [00:01<00:09,  2.99it/s][A
 19%|█▉        | 6/32 [00:01<00:07,  3.56it/s][A
 22%|██▏       | 7/32 [00:01<00:05,  4.23it/s][A
 25%|██▌       | 8/32 [00:01<00:05,  4.75it/s][A
 28%|██▊       | 9/32 [00:02<00:04,  5.42it/s][A
 31%|███▏      | 10/32 [00:02<00:03,  6.02it/s][A
 34%|███▍      | 11/32 [00:02<00:03,  6.54it/s][A
 38%|███▊      | 12/32 [00:02<00:02,  6.88it/s][A
 41%|████      | 13/32 [00:02<00:02,  7.19it/s][A
 44%|████▍     | 14/32 [00:02<00:02,  7.45it/s][A
 47%|████▋     | 15/32 [00:02<00:02,  7.54it/s][A
 50%|█████     | 16/32 [00:02<00:02,  7.70it/s][A
 53%|█████▎    | 17/32 [00:03<00:01,  7.82it/s][A
 56%|█████▋    | 18/32 [00:03<00:01,  7.71it/s][A
 59%|█████▉    | 19/32 [00:03<00:01,  7.85it/s][A
 62%|██████▎   | 20/32 [00:03<00:01,  7.96it/s]

{'eval_loss': 0.7547961473464966, 'eval_accuracy': 0.801, 'eval_precision': 0.943952802359882, 'eval_recall': 0.64, 'eval_f1': 0.7628128724672228, 'eval_runtime': 5.4116, 'eval_samples_per_second': 5.913, 'eval_steps_per_second': 0.185, 'epoch': 4.0}


100%|██████████| 885/885 [15:52<00:00,  1.13it/s]

{'loss': 0.0528, 'learning_rate': 0.0, 'epoch': 5.0}



  0%|          | 0/32 [00:00<?, ?it/s][A
  6%|▋         | 2/32 [00:00<00:05,  5.07it/s][A
  9%|▉         | 3/32 [00:00<00:08,  3.36it/s][A
 12%|█▎        | 4/32 [00:01<00:11,  2.40it/s][A
 16%|█▌        | 5/32 [00:01<00:10,  2.46it/s][A
 19%|█▉        | 6/32 [00:01<00:08,  3.19it/s][A
 22%|██▏       | 7/32 [00:02<00:06,  3.94it/s][A
 25%|██▌       | 8/32 [00:02<00:05,  4.64it/s][A
 28%|██▊       | 9/32 [00:02<00:04,  5.31it/s][A
 31%|███▏      | 10/32 [00:02<00:03,  5.79it/s][A
 34%|███▍      | 11/32 [00:02<00:03,  6.33it/s][A
 38%|███▊      | 12/32 [00:02<00:02,  6.70it/s][A
 41%|████      | 13/32 [00:02<00:02,  6.96it/s][A
 44%|████▍     | 14/32 [00:03<00:02,  7.15it/s][A
 47%|████▋     | 15/32 [00:03<00:02,  7.17it/s][A
 50%|█████     | 16/32 [00:03<00:02,  7.31it/s][A
 53%|█████▎    | 17/32 [00:03<00:02,  7.27it/s][A
 56%|█████▋    | 18/32 [00:03<00:01,  7.37it/s][A
 59%|█████▉    | 19/32 [00:03<00:01,  7.38it/s][A
 62%|██████▎   | 20/32 [00:03<00:01,  7.38it/s]

{'eval_loss': 0.9358894228935242, 'eval_accuracy': 0.77, 'eval_precision': 0.944078947368421, 'eval_recall': 0.574, 'eval_f1': 0.7139303482587064, 'eval_runtime': 5.9235, 'eval_samples_per_second': 5.402, 'eval_steps_per_second': 0.169, 'epoch': 5.0}


100%|██████████| 885/885 [16:17<00:00,  1.10s/it]

{'train_runtime': 977.8286, 'train_samples_per_second': 28.962, 'train_steps_per_second': 0.905, 'train_loss': 0.13215566516596045, 'epoch': 5.0}
Train over!





In [12]:
from mindspore import Tensor
def predict(text, label=None, max_seq_len=64):
    """Classify one text with the module-level model and print the result.

    Args:
        text: Input string to classify.
        label: Optional ground-truth class id (a key of the label map, 0 or 1)
            printed next to the prediction.
        max_seq_len: Maximum number of tokens fed to the model; longer inputs
            are truncated. The default matches the default used by
            ``process_dataset`` when preparing the training data.
    """
    label_map = {0: "Non Hate Speech", 1: "Hate Speech"}

    # Truncate so arbitrarily long inputs cannot exceed the length limit.
    encoded = tokenizer(text, truncation=True, max_length=max_seq_len)
    text_tokenized = Tensor([encoded.input_ids])
    outputs = model(text_tokenized)
    # The first element of the model output is treated as the class scores;
    # convert the NumPy index to a built-in int for the label lookup.
    predict_label = int(outputs[0].asnumpy().argmax())
    info = f"inputs: '{text}', predict: '{label_map[predict_label]}'"
    if label is not None:
        info += f" , label: '{label_map[label]}'"
    print(info)
# Smoke-test the classifier on a single sample sentence.
sample_text = "on my way to fuck your bitch in the name of The Lord"
predict(sample_text)

inputs: 'on my way to fuck your bitch in the name of The Lord', predict: 'Hate Speech'
