In [1]:
import sys
sys.path.append('../')

In [2]:
import torch
from transformers import BertModel, XLNetModel
import torch.optim as optim
from utils import config
from utils.process_data import get_bert_dataloader, get_xlnet_dataloader


class BertCls(torch.nn.Module):
    def __init__(self):
        super(BertCls, self).__init__()
        self.bert = BertModel.from_pretrained(config.BERT_MODEL_PATH)
        self.liner = torch.nn.Sequential(
            torch.nn.BatchNorm1d(config.EMBEDDING_DIM * 2),
            torch.nn.Dropout(),
            torch.nn.Linear(config.EMBEDDING_DIM * 2, 1),
            torch.nn.Sigmoid()
        )
        self.weight1 = torch.zeros((config.BATCH_SIZE, 1, config.EMBEDDING_DIM), requires_grad=True)
        torch.nn.init.normal_(self.weight1, mean=0.0, std=1.0)
        self.weight2 = torch.zeros((config.BATCH_SIZE, 1, config.EMBEDDING_DIM), requires_grad=True)
        torch.nn.init.normal_(self.weight2, mean=0.0, std=1.0)

    def forward(self, ste1, ste2):
        ebd1, cls1 = self.bert(ste1)
        ebd2, cls2 = self.bert(ste2)
        ste_ebd1 = torch.bmm(self.weight1, ebd1).view(config.BATCH_SIZE, config.EMBEDDING_DIM)
        ste_ebd2 = torch.bmm(self.weight2, ebd2).view(config.BATCH_SIZE, config.EMBEDDING_DIM)
        # max_pool1, indices1 = torch.max(ebd1, dim=1)
        # max_pool2, indices2 = torch.max(ebd2, dim=1)
        conact = torch.cat((ste_ebd1, ste_ebd2), dim=1)
        out = self.liner(conact)
        return out


class BertClsJoint(torch.nn.Module):
    def __init__(self):
        super(BertClsJoint, self).__init__()
        self.bert = BertModel.from_pretrained(config.BERT_MODEL_PATH)
        self.liner = torch.nn.Sequential(
            torch.nn.BatchNorm1d(config.EMBEDDING_DIM),
            torch.nn.Dropout(),
            torch.nn.Linear(config.EMBEDDING_DIM, 1),
            torch.nn.Sigmoid()
        )

    def forward(self, ste12):
        ebd1, cls1 = self.bert(ste12)
        cls = ebd1[:, 0, :]
        out = self.liner(cls)
        return out


class XLNetCls(torch.nn.Module):
    def __init__(self):
        super(XLNetCls, self).__init__()
        self.xlnet = XLNetModel.from_pretrained(config.XLNET_MODEL_PATH)
        self.liner = torch.nn.Sequential(
            torch.nn.BatchNorm1d(config.EMBEDDING_DIM * 2),
            torch.nn.Dropout(),
            torch.nn.Linear(config.EMBEDDING_DIM * 2, 256),
            torch.nn.BatchNorm1d(256),
            torch.nn.Dropout(),
            torch.nn.ReLU(),
            torch.nn.Linear(256, 1),
            torch.nn.Sigmoid()
        )

    def forward(self, ste1, ste2):
        ebd1 = self.xlnet(ste1)[0]
        ebd2 = self.xlnet(ste2)[0]
        conact = torch.cat((ebd1[:, -1, :], ebd2[:, -1, :]), dim=1)
        out = self.liner(conact)
        return out


def freeze_parameter(cls_model):
    for n, p in cls_model.named_parameters():
        if 'bert' in n:
            p.requires_grad = False
    for n, p in cls_model.named_parameters():
        if 'bert.encoder.layer.11' in n:
            p.requires_grad = True


def train(model, train_data, test_data, epoch=30):
    loss_fn = torch.nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.00001)

    loss_sum = 0.7
    idx = 0
    for e in range(epoch):
        for td in train_data:
            optimizer.zero_grad()

            if config.JIONT:
                s1, l = td
                y = model(s1)
            else:
                s1, s2, l = td
                y = model(s1, s2)

            loss = loss_fn(y, l)
            loss.backward()
            optimizer.step()

            # 指数平均
            loss_sum = 0.9 * loss_sum + 0.1 * loss
            if idx % 100 == 99:
                test_loss = cal_loss(model, test_data)
                print('epoch:{} iter:{} loss:{} test_loss:{}'.format(e, idx, loss_sum, test_loss))
            idx += 1


def cal_loss(model, data):
    loss_sum = 0.7
    loss_fn = torch.nn.BCELoss()
    with torch.no_grad():
        for td in data:
            if config.JIONT:
                s1, l = td
                y = model(s1)
            else:
                s1, s2, l = td
                y = model(s1, s2)
            loss = loss_fn(y, l)
            loss_sum = 0.9 * loss_sum + 0.1 * loss
    return loss_sum


def evaluate(model, test_data):
    model.eval()
    right = 0.1
    preidt_p = 0.1
    positive = 0.1
    with torch.no_grad():
        for td in test_data:
            if config.JIONT:
                s1, l = td
                y = model(s1)
            else:
                s1, s2, l = td
                y = model(s1, s2)
            y = y.cpu().view(-1).numpy()
            y[y > 0.5] = 1
            y[y <= 0.5] = 0
            preidt_p += y.sum()

            l = l.cpu().view(-1).numpy()
            positive += l.sum()
            l[l == 0] = -1
            right += (y == l).sum()
    P = right / preidt_p
    R = right / positive
    F1 = 2 * P * R / (P + R)
    print('P:{} R:{} F1:{}'.format(P, R, F1))

In [3]:
cls_model = BertCls()

In [5]:
for n,p in cls_model.named_parameters():
    print(n,p.shape)

bert.embeddings.word_embeddings.weight torch.Size([21128, 768])
bert.embeddings.position_embeddings.weight torch.Size([512, 768])
bert.embeddings.token_type_embeddings.weight torch.Size([2, 768])
bert.embeddings.LayerNorm.weight torch.Size([768])
bert.embeddings.LayerNorm.bias torch.Size([768])
bert.encoder.layer.0.attention.self.query.weight torch.Size([768, 768])
bert.encoder.layer.0.attention.self.query.bias torch.Size([768])
bert.encoder.layer.0.attention.self.key.weight torch.Size([768, 768])
bert.encoder.layer.0.attention.self.key.bias torch.Size([768])
bert.encoder.layer.0.attention.self.value.weight torch.Size([768, 768])
bert.encoder.layer.0.attention.self.value.bias torch.Size([768])
bert.encoder.layer.0.attention.output.dense.weight torch.Size([768, 768])
bert.encoder.layer.0.attention.output.dense.bias torch.Size([768])
bert.encoder.layer.0.attention.output.LayerNorm.weight torch.Size([768])
bert.encoder.layer.0.attention.output.LayerNorm.bias torch.Size([768])
bert.encoder

bert.encoder.layer.7.attention.self.value.bias torch.Size([768])
bert.encoder.layer.7.attention.output.dense.weight torch.Size([768, 768])
bert.encoder.layer.7.attention.output.dense.bias torch.Size([768])
bert.encoder.layer.7.attention.output.LayerNorm.weight torch.Size([768])
bert.encoder.layer.7.attention.output.LayerNorm.bias torch.Size([768])
bert.encoder.layer.7.intermediate.dense.weight torch.Size([3072, 768])
bert.encoder.layer.7.intermediate.dense.bias torch.Size([3072])
bert.encoder.layer.7.output.dense.weight torch.Size([768, 3072])
bert.encoder.layer.7.output.dense.bias torch.Size([768])
bert.encoder.layer.7.output.LayerNorm.weight torch.Size([768])
bert.encoder.layer.7.output.LayerNorm.bias torch.Size([768])
bert.encoder.layer.8.attention.self.query.weight torch.Size([768, 768])
bert.encoder.layer.8.attention.self.query.bias torch.Size([768])
bert.encoder.layer.8.attention.self.key.weight torch.Size([768, 768])
bert.encoder.layer.8.attention.self.key.bias torch.Size([768])

In [4]:
train_data, test_data = get_xlnet_dataloader()

In [5]:
cls_model = XLNetCls()
cls_model.cuda()

XLNetCls(
  (xlnet): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

In [6]:
for s1, s2, l in train_data:
    y = cls_model(s1, s2)
    break

NameError: name 'train_data' is not defined

In [13]:
y[0].shape

torch.Size([32, 50, 768])

In [6]:
from transformers import BertTokenizer, XLNetTokenizer
xlnet_tokenizer = XLNetTokenizer.from_pretrained(config.XLNET_MODEL_PATH)
bert_tokenizer = BertTokenizer.from_pretrained(config.BERT_MODEL_PATH)

In [8]:
s1='我爱你'
s2='我爱中国'
bert_tokenizer.encode_plus(s1,s2,pad_to_max_length=True, max_length=config.SENTENCE_MAX_LEN)

{'input_ids': [101,
  2769,
  4263,
  872,
  102,
  2769,
  4263,
  704,
  1744,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [36]:
s1='可以购买来回车票吗'
xltokens=xlnet_tokenizer.encode(s1,pad_to_max_length=True, max_length=config.SENTENCE_MAX_LEN)
berttokens=bert_tokenizer.encode(s1,pad_to_max_length=True, max_length=config.SENTENCE_MAX_LEN)

In [37]:
xlnet_tokenizer.decode(xltokens)

'<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad> 可以购买来回车票吗<sep><cls>'

In [38]:
bert_tokenizer.decode(berttokens)

'[CLS] 可 以 购 买 来 回 车 票 吗 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [31]:
a=torch.randn(32,50,768)

In [32]:
a

tensor([[[-1.2944, -0.3684,  0.7030,  ...,  1.1777, -0.3343,  1.2204],
         [ 0.7857, -1.7081, -0.7106,  ...,  0.1871,  0.2125, -1.0864],
         [ 0.8192, -1.1177,  1.2224,  ...,  0.0081, -0.3720,  0.2523],
         ...,
         [-1.6177, -0.1145,  1.2775,  ..., -0.5412, -0.5043,  0.4745],
         [ 0.0548, -0.8622, -0.7523,  ...,  1.1297,  0.7199,  0.2203],
         [ 0.1045,  1.1777,  0.7572,  ...,  0.5329,  0.1469, -0.5432]],

        [[-1.9565,  2.8182,  0.5367,  ..., -0.0983, -0.3800,  0.2389],
         [-0.7574, -1.0912, -0.7398,  ..., -0.7148,  0.0281, -0.3375],
         [-0.4940,  0.8061,  0.3485,  ..., -0.9352,  1.1697, -0.7024],
         ...,
         [ 0.2232, -0.6879,  1.6390,  ..., -0.6820,  0.4687,  1.2124],
         [ 0.9799, -1.1046,  0.8054,  ..., -1.0765,  0.2833, -0.9338],
         [-0.2537, -0.0226, -0.0772,  ..., -0.8614, -0.0285,  0.1079]],

        [[ 0.3912, -1.5073,  0.4322,  ..., -1.3465,  0.9196,  0.3431],
         [-0.2896,  1.8856, -0.3152,  ...,  0

In [38]:
c=torch.mean(a,dim=1)

In [40]:
c.shape

torch.Size([32, 768])

In [37]:
torch.mean?

In [55]:
A=torch.zeros((32,1,50),requires_grad=True)
A

tensor([[[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]]], requires_grad=True)

In [56]:
torch.nn.init.normal_(A, mean=0.0, std=1.0)

tensor([[[ 0.8458, -0.8852, -1.0602,  ...,  0.7175,  0.7173,  0.6936]],

        [[ 0.6601, -0.8741, -0.7984,  ..., -0.9880, -0.9221, -0.7217]],

        [[ 0.2087, -0.1518, -1.1936,  ...,  0.3879, -0.7212,  1.1874]],

        ...,

        [[-0.9784, -0.2496, -0.0850,  ...,  0.7513,  0.2761,  0.9163]],

        [[ 0.4896, -0.4279, -0.0028,  ..., -2.0456, -0.9899, -0.3772]],

        [[ 0.0078, -0.1378,  1.3204,  ..., -0.7012,  0.3273, -2.1293]]],
       requires_grad=True)

In [9]:
b=torch.randn((32,50,768))
c=torch.zeros((32,50,1))

In [15]:
d=b.mul(c)

In [16]:
d.sum(dim=1).shape

torch.Size([32, 768])

In [19]:
idf_w=torch.randn((50,))
idf_t = torch.tensor(idf_w)

  


In [26]:
torch.nn.functional.softmax?

In [27]:
a='123'
a[:50]

'123'

In [28]:
encoder_layer = torch.nn.TransformerEncoderLayer(d_model=512, nhead=8)
src = torch.rand(10, 32, 512)
out = encoder_layer(src)

In [29]:
out.shape

torch.Size([10, 32, 512])