In [1]:
from dataset import Dataset4SKEP
import paddle
from paddlenlp.datasets import MapDataset
import paddle.nn as nn

train_ds = Dataset4SKEP('train')
train_ds = MapDataset(train_ds)
dev_ds   = Dataset4SKEP('dev')
dev_ds = MapDataset(dev_ds)
test_ds  = Dataset4SKEP('test')
test_ds = MapDataset(test_ds)
print(train_ds[0:2])

[{'text': '5. Can regularly rinsing your nose with saline help prevent infection with the new coronavirus? 4. Can eating garlic help prevent infection with the new coronavirus? 6. Do vaccines against pneumonia protect you against the new coronavirus? 7. Can spraying alcohol or chlorine all over your body kill the new coronavirus? 8. How effective are thermal scanners in detecting people infected with the new coronavirus? 9. Can an ultraviolet disinfection lamp kill the new coronavirus? 10. Are hand dryers effective in killing the new coronavirus? 11. The new coronavirus CANNOT be transmitted through mosquito bites. 12. Taking a hot bath does not prevent the new coronavirus disease 13. Cold weather and snow CANNOT kill the new coronavirus. 14. COVID19 virus can be transmitted in areas with hot and humid climates 15. Drinking alcohol does not protect you against COVID19 and can be dangerous 16. Being able to hold your breath for 10 seconds or more without coughing or feeling discomfort D

In [2]:
from paddlenlp.transformers import SkepForSequenceClassification, SkepTokenizer

# load skep
model = SkepForSequenceClassification.from_pretrained(pretrained_model_name_or_path="skep_ernie_2.0_large_en", num_classes=2)
# tokenizer loaded
tokenizer = SkepTokenizer.from_pretrained(pretrained_model_name_or_path="skep_ernie_2.0_large_en")

[32m[2022-04-28 21:33:17,459] [    INFO][0m - Already cached C:\Users\46901\.paddlenlp\models\skep_ernie_2.0_large_en\skep_ernie_2.0_large_en.pdparams[0m
[32m[2022-04-28 21:33:28,492] [    INFO][0m - Already cached C:\Users\46901\.paddlenlp\models\skep_ernie_2.0_large_en\skep_ernie_2.0_large_en.vocab.txt[0m


In [3]:
import os
from functools import partial


import numpy as np
import paddle.nn.functional as F
from paddlenlp.data import Stack, Tuple, Pad

from utils import create_dataloader,convert_example


batch_size = 4
max_seq_length = 512

trans_func = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length)

batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input_ids
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type_ids
    Stack()  # labels
): [data for data in fn(samples)]
train_data_loader = create_dataloader(
    train_ds,
    mode='train',
    batch_size=batch_size,
    batchify_fn=batchify_fn,
    trans_fn=trans_func)
dev_data_loader = create_dataloader(
    dev_ds,
    mode='dev',
    batch_size=batch_size,
    batchify_fn=batchify_fn,
    trans_fn=trans_func)


import time

from metrics import evaluate

epochs = 10
# save_dir
ckpt_dir = "skep_ckpt"
# step number
num_training_steps = len(train_data_loader) * epochs

optimizer = paddle.optimizer.AdamW(
    learning_rate=2e-5,
    parameters=model.parameters())
criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Accuracy()

In [4]:
print('Total: ',num_training_steps)
global_step = 0
tic_train = time.time()
for epoch in range(1, epochs + 1):
    for step, batch in enumerate(train_data_loader, start=1):
        input_ids, token_type_ids, labels = batch

        logits = model(input_ids, token_type_ids)

        loss = criterion(logits, labels)

        probs = F.softmax(logits, axis=1)

        correct = metric.compute(probs, labels)
        metric.update(correct)
        acc = metric.accumulate()

        global_step += 1
        if global_step % 200 == 0:
            print(
                "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                % (global_step, epoch, step, loss, acc,
                    10 / (time.time() - tic_train)))
            tic_train = time.time()

        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        if global_step % 200 == 0:
            save_dir = os.path.join(ckpt_dir, "model_%d" % global_step)
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)

            evaluate(model, criterion, metric, dev_data_loader)

            model.save_pretrained(save_dir)

            tokenizer.save_pretrained(save_dir)

Total:  4540
global step 200, epoch: 1, batch: 200, loss: 0.10253, accu: 0.78250, speed: 0.17 step/s
eval loss: 0.29538, accu: 0.87899,  F1: 0.8097, Precision: 0.8587, Recall: 0.7807
global step 400, epoch: 1, batch: 400, loss: 0.02642, accu: 0.87250, speed: 0.13 step/s
eval loss: 0.20402, accu: 0.91261,  F1: 0.8786, Precision: 0.8732, Recall: 0.8844
global step 600, epoch: 2, batch: 146, loss: 0.06386, accu: 0.92472, speed: 0.12 step/s
eval loss: 0.17779, accu: 0.93782,  F1: 0.9134, Precision: 0.9084, Recall: 0.9187
global step 800, epoch: 2, batch: 346, loss: 0.08065, accu: 0.94500, speed: 0.12 step/s
eval loss: 0.15562, accu: 0.94286,  F1: 0.9186, Precision: 0.9230, Recall: 0.9143
global step 1000, epoch: 3, batch: 92, loss: 0.01235, accu: 0.97491, speed: 0.12 step/s
eval loss: 0.14398, accu: 0.95126,  F1: 0.9314, Precision: 0.9303, Recall: 0.9325
global step 1200, epoch: 3, batch: 292, loss: 0.00579, accu: 0.97500, speed: 0.11 step/s
eval loss: 0.15965, accu: 0.94286,  F1: 0.9124, 

In [5]:
import numpy as np
from utils import convert_example

batch_size = 2 # limited by the memory of 3080
# process test data
trans_func = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    is_test=True)
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
    Stack() # qid
): [data for data in fn(samples)]
test_data_loader = create_dataloader(
    test_ds,
    mode='test',
    batch_size=batch_size,
    batchify_fn=batchify_fn,
    trans_fn=trans_func)

In [33]:
# choose model directory
params_path = './skep_ckpt//model_320/model_state.pdparams'
if params_path and os.path.isfile(params_path):
    # load model
    state_dict = paddle.load(params_path)
    model.set_dict(state_dict)
    print("Loaded parameters from %s" % params_path)
else:
    print("Model not found")

Loaded parameters from ./skep_ckpt//model_3200/model_state.pdparams


In [34]:
results = []

model.eval()
for batch in test_data_loader:
    input_ids, token_type_ids, qids = batch

    logits = model(input_ids, token_type_ids)
    probs = F.softmax(logits, axis=-1)
    idx = paddle.argmax(probs, axis=1).numpy()
    idx = idx.tolist()
    labels = [str(i) for i in idx]
    qids = qids.numpy().tolist()
    results.extend(zip(qids, labels))

In [35]:
res_dir = "./results"
if not os.path.exists(res_dir):
    os.makedirs(res_dir)

with open(os.path.join(res_dir, "SKEP.csv"), 'w+', encoding="utf8") as f:
    f.write("Id,Predicted\n")
    for qid, label in results:
        f.write(str(qid[0])+","+label+"\n")