In [1]:
# --- Dataset files (tab-separated "text<TAB>label" samples, one per line) ---
TRAIN_SAMPLE_PATH = '/kaggle/input/scnuai-dataset/SCNUAI-dataset-v2/v2/train.txt'
DEV_SAMPLE_PATH = '/kaggle/input/scnuai-dataset/SCNUAI-dataset-v2/v2/dev.txt'
TEST_SAMPLE_PATH = '/kaggle/input/scnuai-dataset/SCNUAI-dataset-v2/v2/test.txt'

# File listing the class names; read below into CLASS_LABELS and by get_label().
LABEL_PATH = '/kaggle/input/scnuai-dataset/SCNUAI-dataset-v2/v2/class.txt'

# Token id used to right-pad input_ids, and the fixed length every sample is
# padded / cut to in Dataset.__getitem__.
BERT_PAD_ID = 0
TEXT_LEN = 512

# Path handed to BertTokenizer/BertModel.from_pretrained, and the directory
# prefix used when saving checkpoints.
BERT_MODEL = '/kaggle/working/bert-base-chinese'
MODEL_DIR = '/kaggle/working/'

# TextCNN head: conv kernel width, conv output channels, classifier outputs,
# and the conv kernel heights (one Conv2d per entry).
EMBEDDING_DIM = 768
NUM_FILTERS = 512
NUM_CLASSES = 34
FILTER_SIZES = [2, 3, 4]

# Training loop settings: DataLoader batch size, number of epochs, Adam lr.
BATCH_SIZE = 16
EPOCH = 100
LR = 2e-6

# One stripped entry per line of the label file. Reuses LABEL_PATH (instead of
# repeating the literal path) and reads as UTF-8, the same way get_label() does,
# so both views of the file cannot drift apart.
CLASS_LABELS = []
with open(LABEL_PATH, 'r', encoding='utf-8') as f:
    CLASS_LABELS += [line.strip() for line in f]

import torch

# Prefer the GPU when one is visible to PyTorch; otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

if __name__ == '__main__':
    # Sanity check: move a tiny tensor to the selected device and show it.
    print(torch.tensor([1, 2, 3]).to(DEVICE))

tensor([1, 2, 3], device='cuda:0')


In [2]:
# Enable Git LFS so the clone below also fetches the LFS-tracked files
# (the large "Filtering content" download in the cell output).
!git lfs install
# Clone the pretrained checkpoint into ./bert-base-chinese.
# NOTE(review): BERT_MODEL points at /kaggle/working/bert-base-chinese, so this
# presumably relies on the notebook's working directory being /kaggle/working — confirm.
!git clone https://huggingface.co/bert-base-chinese

Error: Failed to call git rev-parse --git-dir: exit status 128 
Git LFS initialized.
Cloning into 'bert-base-chinese'...
remote: Enumerating objects: 52, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 52 (delta 0), reused 0 (delta 0), pack-reused 49[K
Unpacking objects: 100% (52/52), 158.38 KiB | 5.87 MiB/s, done.
Filtering content: 100% (4/4), 1.59 GiB | 99.67 MiB/s, done.


In [3]:
from torch.utils import data
import torch
from transformers import BertTokenizer
from sklearn.metrics import classification_report

from transformers import logging
logging.set_verbosity_error()

class Dataset(data.Dataset):
    """Tab-separated ``text<TAB>label`` samples, tokenized for BERT.

    Every item is a triple of tensors ``(input_ids, attention_mask, target)``.
    ``input_ids`` and ``attention_mask`` always hold exactly ``TEXT_LEN``
    entries: shorter samples are right-padded (``BERT_PAD_ID`` / ``0``) and
    longer ones are cut off at ``TEXT_LEN``.
    """

    def __init__(self, type='train'):
        """Load the sample file of the requested split and the tokenizer.

        Args:
            type: Split name, one of ``'train'``, ``'dev'`` or ``'test'``.

        Raises:
            ValueError: If ``type`` is not one of the known split names
                (previously this surfaced as an UnboundLocalError).
        """
        super().__init__()
        sample_paths = {
            'train': TRAIN_SAMPLE_PATH,
            'dev': DEV_SAMPLE_PATH,
            'test': TEST_SAMPLE_PATH,
        }
        if type not in sample_paths:
            raise ValueError(
                f"unknown dataset type {type!r}; expected one of {sorted(sample_paths)}"
            )

        # Close the file deterministically, and drop whitespace-only lines
        # (e.g. a trailing blank line), which cannot be split into text/label.
        with open(sample_paths[type], encoding='utf-8') as f:
            self.lines = [line for line in f if line.strip()]
        self.tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

    def __len__(self):
        """Return the number of (non-blank) sample lines."""
        return len(self.lines)

    def __getitem__(self, index):
        """Tokenize sample ``index`` and return fixed-length tensors.

        Returns:
            tuple: ``(input_ids, attention_mask, target)`` where the first two
            are 1-D tensors of length ``TEXT_LEN`` and ``target`` is a scalar
            tensor holding the integer class id.
        """
        # The label is the last tab-separated field; splitting from the right
        # keeps any tab characters that occur inside the text itself.
        text, label = self.lines[index].rsplit('\t', 1)
        tokened = self.tokenizer(text)

        # NOTE(review): cutting after tokenization appears to drop the trailing
        # special token of over-long texts (see the sample output, where the
        # truncated row does not end in a separator id) — kept as-is so inputs
        # stay identical to what existing checkpoints were trained on.
        input_ids = list(tokened['input_ids'][:TEXT_LEN])
        mask = list(tokened['attention_mask'][:TEXT_LEN])

        # pad_len is 0 for samples that were already TEXT_LEN or longer.
        pad_len = TEXT_LEN - len(input_ids)
        input_ids += [BERT_PAD_ID] * pad_len
        mask += [0] * pad_len

        target = int(label)  # int() tolerates the trailing newline
        return torch.tensor(input_ids), torch.tensor(mask), torch.tensor(target)


def get_label():
    """Read the class-name file and build both lookup directions.

    Returns:
        tuple: ``(id2label, label2id)``. ``id2label`` is the list of
        whitespace-separated tokens found in ``LABEL_PATH`` (list index =
        class id); ``label2id`` maps each token back to its index.
    """
    # Context manager so the handle is closed instead of leaked.
    with open(LABEL_PATH, encoding='utf-8') as f:
        id2label = f.read().split()
    label2id = {label: idx for idx, label in enumerate(id2label)}
    return id2label, label2id


def evaluate(pred, true, target_names=None, output_dict=False):
    """Build a scikit-learn classification report for a set of predictions.

    Args:
        pred: Predicted class ids.
        true: Ground-truth class ids.
        target_names: Optional display names forwarded to the report.
        output_dict: Forwarded to ``classification_report``; when true the
            report is returned as a dict instead of formatted text.

    Returns:
        Whatever ``classification_report`` returns for these arguments.
    """
    report_options = {
        'target_names': target_names,
        'output_dict': output_dict,
        # Score 0 (without warnings) for classes that have no samples.
        'zero_division': 0,
    }
    # classification_report expects the ground truth first, predictions second.
    return classification_report(true, pred, **report_options)

if __name__ == '__main__':
    # Smoke test: load the default ('train') split and show the first batch of
    # two samples as (input_ids, attention_mask, target).
    dataset = Dataset()
    loader = data.DataLoader(dataset, batch_size=2)
    print(next(iter(loader)))



[tensor([[ 101,  122,  119,  ...,    0,    0,    0],
        [ 101, 3724, 8038,  ...,  711, 1398, 3333]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]), tensor([18, 22])]


In [4]:
import torch.nn as nn
import torch.nn.functional as F
import torch
from transformers import BertModel

class TextCNN(nn.Module):
    """Frozen BERT encoder followed by a TextCNN classification head.

    The token representations produced by BERT are treated as a one-channel
    image; one ``Conv2d`` per entry of ``FILTER_SIZES`` slides over the token
    axis, each result is max-pooled over the whole sequence, and the pooled
    features are concatenated and projected to ``NUM_CLASSES`` logits.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_MODEL)
        # Freeze the encoder: only the conv filters and the linear layer train.
        for param in self.bert.parameters():
            param.requires_grad = False
        # Kernel (size, EMBEDDING_DIM) spans `size` consecutive tokens and the
        # full embedding width.
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, NUM_FILTERS, (size, EMBEDDING_DIM)) for size in FILTER_SIZES]
        )
        # One NUM_FILTERS-wide feature block per conv branch; derived from
        # FILTER_SIZES so changing the list cannot desynchronise the shapes.
        self.linear = nn.Linear(NUM_FILTERS * len(FILTER_SIZES), NUM_CLASSES)

    def conv_and_pool(self, conv, input):
        """Apply ``conv`` + ReLU, then max-pool over all remaining positions.

        Args:
            conv: Convolution module to apply.
            input: 4-D tensor ``(batch, channels, height, width)``.

        Returns:
            Tensor of shape ``(batch, out_channels)``.
        """
        out = conv(input)
        out = F.relu(out)
        pooled = F.max_pool2d(out, (out.shape[2], out.shape[3]))
        # Flatten only the trailing 1x1 spatial axes. A bare squeeze() would
        # also remove the batch axis when the batch holds a single sample,
        # which then breaks the dim=1 concatenation in forward().
        return pooled.flatten(1)

    def forward(self, input, mask):
        """Return class logits of shape ``(batch, NUM_CLASSES)``.

        Args:
            input: Token ids, ``(batch, seq_len)``.
            mask: Attention mask with the same shape as ``input``.
        """
        # First BERT output with an added channel axis so Conv2d can consume it.
        out = self.bert(input, mask)[0].unsqueeze(1)
        out = torch.cat([self.conv_and_pool(conv, out) for conv in self.convs], dim=1)
        return self.linear(out)


if __name__ == '__main__':
    # Smoke test: push two random id sequences of length TEXT_LEN through a
    # fresh model and show the shape of the resulting logits.
    model = TextCNN()
    input = torch.randint(low=0, high=3000, size=(2, TEXT_LEN))
    mask = torch.ones_like(input)  # attend to every position
    print(model(input, mask).shape)


torch.Size([2, 34])


In [None]:
# train.py
if __name__ == '__main__':
    id2label, _ = get_label()

    train_dataset = Dataset('train')
    train_loader = data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    # Shuffled on purpose: the periodic dev check below draws the first batch
    # of a fresh iterator, so shuffling makes that a random dev batch each time.
    dev_dataset = Dataset('dev')
    dev_loader = data.DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=True)

    # To train from scratch instead of resuming, use: model = TextCNN().to(DEVICE)
    # NOTE(security): torch.load unpickles the whole module object; only load
    # checkpoint files you trust.
    # NOTE(review): PyTorch releases whose torch.load defaults to
    # weights_only=True need weights_only=False for a fully pickled module —
    # confirm against the installed version.
    # map_location lets a checkpoint saved on GPU load on a CPU-only session.
    model = torch.load('/kaggle/input/scnuai-models/v3-26.pth', map_location=DEVICE).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    loss_fn = nn.CrossEntropyLoss()

    for e in range(EPOCH):
        for b, (input, mask, target) in enumerate(train_loader):
            input = input.to(DEVICE)
            mask = mask.to(DEVICE)
            target = target.to(DEVICE)

            pred = model(input, mask)
            loss = loss_fn(pred, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Only report metrics every 50th batch.
            if b % 50 != 0:
                continue

            # Accuracy on the training batch that was just used for the update.
            y_pred = torch.argmax(pred, dim=1)
            report = evaluate(y_pred.detach().cpu().numpy(), target.cpu().numpy(), output_dict=True)

            # Score one dev batch in eval mode so dropout does not perturb the
            # metric, then restore whatever mode the model was in for training.
            was_training = model.training
            model.eval()
            with torch.no_grad():
                dev_input, dev_mask, dev_target = next(iter(dev_loader))
                dev_input = dev_input.to(DEVICE)
                dev_mask = dev_mask.to(DEVICE)
                dev_target = dev_target.to(DEVICE)
                dev_pred = model(dev_input, dev_mask)
                dev_pred_ = torch.argmax(dev_pred, dim=1)
                dev_report = evaluate(dev_pred_.cpu().numpy(), dev_target.cpu().numpy(), output_dict=True)
            model.train(was_training)

            print(
                '>> epoch:', e,
                'batch:', b,
                'loss:', round(loss.item(), 5),
                'train_acc:', report['accuracy'],
                'dev_acc:', dev_report['accuracy'],
            )
        # e % 1 is always 0, i.e. the full model is saved after every epoch;
        # raise the modulus to checkpoint less often.
        if e % 1 == 0:
            torch.save(model, MODEL_DIR + f'{e}.pth')

>> epoch: 0 batch: 0 loss: 0.04633 train_acc: 1.0 dev_acc: 0.75
