In [2]:
from dataset import Dataset4SKEP
from paddlenlp.datasets import MapDataset
import paddle.nn as nn



# Build the three splits from the project-local Dataset4SKEP class and wrap
# each one in a PaddleNLP MapDataset (train first, then dev, then test).
train_ds = MapDataset(Dataset4SKEP('train'))
dev_ds = MapDataset(Dataset4SKEP('dev'))
test_ds = MapDataset(Dataset4SKEP('test'))

# Sanity check: show the first ten training examples.
print(train_ds[0:10])

  and should_run_async(code)


[{'text': 'can regularly rinsing your nose with saline help prevent infection with the new coronavirus? can eating garlic help prevent infection with the new coronavirus? covidmalaysia do vaccines against pneumonia protect you against the new coronavirus? can spraying alcohol or chlorine all over your body kill the new coronavirus? chamber how effective are thermal scanners in detecting people infected with the new coronavirus? can an ultraviolet disinfection lamp kill the new coronavirus? are hand dryers effective in killing the new coronavirus? the new coronavirus cannot be transmitted through mosquito bites. taking a hot bath does not prevent the new coronavirus disease cold weather and snow cannot kill the new coronavirus. covid virus can be transmitted in areas with hot and humid climates drinking alcohol does not protect you against covid and can be dangerous being able to hold your breath for seconds or more without coughing or feeling discomfort does not mean you are free from 

In [3]:
from paddlenlp.transformers import SkepForSequenceClassification, SkepTokenizer

# The classifier and its tokenizer are both loaded from the same pretrained
# SKEP checkpoint name, so the name is spelled out once.
skep_checkpoint_name = "skep_ernie_2.0_large_en"

# Sequence classifier with a two-class output head.
model = SkepForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=skep_checkpoint_name,
    num_classes=2)

# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = SkepTokenizer.from_pretrained(
    pretrained_model_name_or_path=skep_checkpoint_name)

[32m[2022-04-25 21:36:17,387] [    INFO][0m - Already cached C:\Users\46901\.paddlenlp\models\skep_ernie_2.0_large_en\skep_ernie_2.0_large_en.pdparams[0m
[32m[2022-04-25 21:36:23,095] [    INFO][0m - Already cached C:\Users\46901\.paddlenlp\models\skep_ernie_2.0_large_en\skep_ernie_2.0_large_en.vocab.txt[0m


In [4]:
import os
from functools import partial


import numpy as np
import paddle
import paddle.nn.functional as F
from paddlenlp.data import Stack, Tuple, Pad

from utils import create_dataloader,convert_example


# Batching settings used by every data loader in this notebook.
batch_size = 32
max_seq_length = 256

# Per-example transform: convert_example (from utils.py, not shown here) with
# the tokenizer and the maximum sequence length bound in advance.
trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=max_seq_length)

# Collate function: the Tuple of Pad/Pad/Stack is created once (as the default
# argument) and applied to each list of samples; the result is returned as a list.
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input_ids
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type_ids
    Stack(),                                           # labels
): list(fn(samples))

# Arguments that are the same for the train and dev loaders.
_loader_kwargs = dict(
    batch_size=batch_size,
    batchify_fn=batchify_fn,
    trans_fn=trans_func)

train_data_loader = create_dataloader(train_ds, mode='train', **_loader_kwargs)
dev_data_loader = create_dataloader(dev_ds, mode='dev', **_loader_kwargs)


import time

from metrics import evaluate

# Number of passes over the training loader.
epochs = 5
# Directory under which the training loop writes its checkpoints.
ckpt_dir = "skep_ckpt"
# Total number of training steps: batches per epoch times epochs.
num_training_steps = epochs * len(train_data_loader)

# Loss function and accuracy metric used by the training loop and evaluate().
criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Accuracy()

# AdamW over all model parameters with a constant learning rate.
optimizer = paddle.optimizer.AdamW(
    parameters=model.parameters(),
    learning_rate=2e-5)

In [None]:
# Fine-tuning loop: forward pass, loss, accuracy bookkeeping, backward pass and
# optimizer update for every batch, with periodic logging and checkpointing.
log_every = 500    # steps between two progress log lines
save_every = 1000  # steps between two dev evaluations / checkpoints

print('Total: ',num_training_steps)
global_step = 0
tic_train = time.time()
for epoch in range(1, epochs + 1):
    for step, batch in enumerate(train_data_loader, start=1):
        input_ids, token_type_ids, labels = batch

        logits = model(input_ids, token_type_ids)
        loss = criterion(logits, labels)

        # The metric is fed class probabilities rather than raw logits.
        probs = F.softmax(logits, axis=1)
        correct = metric.compute(probs, labels)
        metric.update(correct)
        acc = metric.accumulate()

        global_step += 1
        if global_step % log_every == 0:
            # Speed is the number of steps since the previous log line divided
            # by the elapsed wall time. The numerator must be the logging
            # interval itself; a fixed 10 under-reports the speed by 50x.
            print(
                "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                % (global_step, epoch, step, loss, acc,
                    log_every / (time.time() - tic_train)))
            tic_train = time.time()

        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        # Evaluate and checkpoint every `save_every` steps, and also on the
        # very last step: when the run is shorter than `save_every` steps
        # (e.g. 285 in total) no checkpoint would be written otherwise.
        if global_step % save_every == 0 or global_step == num_training_steps:
            save_dir = os.path.join(ckpt_dir, "model_%d" % global_step)
            os.makedirs(save_dir, exist_ok=True)

            # NOTE(review): evaluate() comes from metrics.py (not shown) and
            # receives the same `metric` object used above -- confirm that it
            # restores training mode and leaves the metric in a sane state.
            evaluate(model, criterion, metric, dev_data_loader)

            model.save_pretrained(save_dir)
            tokenizer.save_pretrained(save_dir)

Total:  285


In [None]:
import numpy as np
from utils import convert_example

# Test-time preprocessing: the same convert_example transform as before, this
# time called with is_test=True.
trans_func = partial(convert_example, tokenizer=tokenizer,
                     max_seq_length=max_seq_length, is_test=True)

# Collate function for test batches: pad the two id sequences and stack the
# third field (the qid); the collated fields are returned as a list.
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
    Stack(),                                           # qid
): list(fn(samples))

# Loader over the test split, built in 'test' mode.
test_data_loader = create_dataloader(
    test_ds,
    mode='test',
    trans_fn=trans_func,
    batchify_fn=batchify_fn,
    batch_size=batch_size)

Error: Session cannot generate requests

In [None]:
# choose model directory
# Checkpoints are written by the training loop to <ckpt_dir>/model_<step>/, so
# the path is built from ckpt_dir instead of a hand-typed directory name (the
# previous literal pointed at "skep_ckp", which training never creates).
# NOTE(review): the step number must match a checkpoint that was actually
# saved -- adjust 'model_5000' to the run being evaluated.
params_path = os.path.join(ckpt_dir, 'model_5000', 'model_state.pdparams')
if params_path and os.path.isfile(params_path):
    # load model
    state_dict = paddle.load(params_path)
    model.set_dict(state_dict)
    print("Loaded parameters from %s" % params_path)
else:
    # Do not fail, but make the fallback visible: prediction will otherwise
    # silently run with whatever weights `model` currently holds.
    print("WARNING: checkpoint %s not found; keeping the weights currently in memory"
          % params_path)

In [None]:
# Predict a label for every test example and collect (qid, label) pairs.
label_map = {0 : 'nonrumour', 1 : 'rumour'}  # class index -> output label
results = []

model.eval()
# Inference needs no gradients; no_grad() stops autograd from recording the
# forward pass for every test batch.
with paddle.no_grad():
    for batch in test_data_loader:
        input_ids, token_type_ids, qids = batch

        logits = model(input_ids, token_type_ids)

        # Predicted class = index of the highest probability in each row.
        probs = F.softmax(logits, axis=-1)
        idx = paddle.argmax(probs, axis=1).numpy()
        idx = idx.tolist()
        labels = [label_map[i] for i in idx]

        # qids is kept in the nested form produced by .numpy().tolist(); the
        # CSV-writing cell reads the id with qid[0].
        qids = qids.numpy().tolist()
        results.extend(zip(qids, labels))

In [None]:
# Write the predictions to <res_dir>/SKEP.csv: a header line followed by one
# "<id>,<label>" line per entry of `results`.
res_dir = "./results"
if not os.path.exists(res_dir):
    os.makedirs(res_dir)

csv_path = os.path.join(res_dir, "SKEP.csv")
with open(csv_path, 'w', encoding="utf8") as out_file:
    out_file.write("Id,Predicted\n")
    # Each qid is indexable; its first element is written as the Id column.
    out_file.writelines(
        str(qid[0]) + "," + label + "\n" for qid, label in results)