# 微调句子对分类模型


## 1. 加载数据集


In [8]:
from torch.utils.data import Dataset
import json


class AFQMC(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        Data = {}
        with open(data_file, "rt") as f:
            for idx, line in enumerate(f):
                sample = json.loads(line.strip())
                Data[idx] = sample
        return Data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]


train_data = AFQMC("./data/afqmc_public/train.json")
valid_data = AFQMC("./data/afqmc_public/dev.json")

print(len(train_data))
print(train_data[0])

34334
{'sentence1': '蚂蚁借呗等额还款可以换成先息后本吗', 'sentence2': '借呗有先息到期还本吗', 'label': '0'}


In [None]:
from torch.utils.data import IterableDataset
import json


class IterableAFQMC(IterableDataset):
    def __init__(self, data_file):
        self.data_file = data_file

    def __iter__(self):
        with open(self.data_file, "rt") as f:
            for line in f:
                sample = json.loads(line.strip())
                yield sample


train_data = IterableAFQMC("./data/afqmc_public/train.json")
print(next(iter(train_data)))

### DataLoader


In [10]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

checkpoint = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def collote_fn(batch_samples):
    batch_sentence_1, batch_sentence_2 = [], []
    batch_label = []
    for sample in batch_samples:
        batch_sentence_1.append(sample["sentence1"])
        batch_sentence_2.append(sample["sentence2"])
        batch_label.append(int(sample["label"]))
    X = tokenizer(
        batch_sentence_1,
        batch_sentence_2,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    y = torch.tensor(batch_label)
    return X, y


train_dataloader = DataLoader(
    train_data, batch_size=4, shuffle=True, collate_fn=collote_fn
)
valid_dataloader = DataLoader(
    valid_data, batch_size=4, shuffle=False, collate_fn=collote_fn
)

batch_X, batch_y = next(iter(train_dataloader))
print("batch_X shape:", {k: v.shape for k, v in batch_X.items()})
print("batch_y shape:", batch_y.shape)
print(batch_X)
print(batch_y)

batch_X shape: {'input_ids': torch.Size([4, 26]), 'token_type_ids': torch.Size([4, 26]), 'attention_mask': torch.Size([4, 26])}
batch_y shape: torch.Size([4])
{'input_ids': tensor([[ 101, 6010, 6009,  955, 1446, 6820, 3621,  794,  865, 7583, 2140, 7027,
         2807, 1408,  102,  865, 7583, 1377,  809, 5632, 1220, 6820,  955, 1446,
         1408,  102],
        [ 101, 5709, 1446, 2128, 1059, 6371, 6395, 1927, 6571, 3221,  784,  720,
         2658, 1105,  102, 5709, 1446, 2990, 4850, 2128, 1059, 1927, 6571,  102,
            0,    0],
        [ 101,  955, 1446, 4509, 6435, 8024, 1914, 7270, 3198, 7313, 1168, 6572,
          102,  955, 1446, 8024, 6587, 3621, 1567, 3198,  952, 5543, 1168,  102,
            0,    0],
        [ 101, 6010, 6009,  955, 1446, 2458,  679,  749, 2582,  720, 1215,  102,
          711,  784,  720, 2769, 4638, 6010, 6009,  955, 1446,  679, 5543, 4509,
         6435,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,

# 2. 训练模型


In [None]:
import torch
from torch import nn
from transformers import AutoModel

device = "cuda" if torch.cuda.is_available() else "cpu"


class BertForPairwiseCLS(nn.Module):
    def __init__(self):
        super(BertForPairwiseCLS, self).__init__()
        self.bert_encoder = AutoModel.from_pretrained(checkpoint)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, 2)

    def forward(self, x):
        bert_output = self.bert_encoder(**x)
        cls_vectors = bert_output.last_hidden_state[:, 0, :]
        cls_vectors = self.dropout(cls_vectors)
        logits = self.classifier(cls_vectors)
        return logits


model = BertForPairwiseCLS().to(device)
print(model)

In [11]:
from torch import nn
from transformers import AutoConfig
from transformers import BertPreTrainedModel, BertModel

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")


class BertForPairwiseCLS(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(config.hidden_size, 2)
        self.post_init()

    def forward(self, x):
        bert_output = self.bert(**x)
        cls_vectors = bert_output.last_hidden_state[:, 0, :]
        cls_vectors = self.dropout(cls_vectors)
        logits = self.classifier(cls_vectors)
        return logits


config = AutoConfig.from_pretrained(checkpoint)

model = BertForPairwiseCLS.from_pretrained(checkpoint, config=config)
print(model)

Using cpu device


Some weights of BertForPairwiseCLS were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForPairwiseCLS(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

### 优化模型参数


In [12]:
from tqdm import tqdm


def train_loop(dataloader, model, loss_fn, optimizer, lr_scheduler, epoch, total_loss):
    progress_bar = tqdm(range(len(dataloader)))
    progress_bar.set_description(f"loss: {0:>7f}")
    finish_step_num = (epoch - 1) * len(dataloader)

    model.train()

    for step, (X, y) in enumerate(dataloader, start=1):
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        progress_bar.set_description(
            f"loss: {total_loss / (finish_step_num + step):>7f}"
        )
        progress_bar.update(1)

    return total_loss


def test_loop(dataloader, model, mode="Test"):
    assert mode in ["Valid", "Test"]
    size = len(dataloader.dataset)
    correct = 0

    model.eval()
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    correct /= size
    print(f"{mode} Accuracy: {(100*correct):>0.1f}%\n")

In [13]:
from transformers import AdamW, get_scheduler

leanring_rate = 1e-5
epoch_num = 3

loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=leanring_rate)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch_num * len(train_dataloader),
    # 告诉调度器总共会有多少训练步骤，这样它就可以为每一步计算合适的学习率。
)

total_loss = 0
for t in range(epoch_num):
    print(f"Epoch {t+1}\n-------------------------------")
    # total_loss = train_loop(
    #     train_dataloader, model, loss_fn, optimizer, lr_scheduler, t + 1, total_loss
    # )
    test_loop(valid_dataloader, model, mode="Valid")
print("Done!")



Epoch 1
-------------------------------
Valid Accuracy: 46.2%

Epoch 2
-------------------------------
Valid Accuracy: 46.2%

Epoch 3
-------------------------------
Valid Accuracy: 46.2%

Done!
