In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '4,5,6,7'

In [2]:
import torch
from transformers import LineByLineTextDataset
from modeling_nezha import NeZhaForSequenceClassification,NeZhaForMaskedLM,NeZhaPreTrainedModel
from configuration_nezha import NeZhaConfig

# 数据预处理

In [3]:
train_path = './data/gaiic_track3_round1_train_20210228.tsv'
test_path = './data/gaiic_track3_round1_testA_20210228.tsv'

In [4]:
vocab_file = './tokens.txt'

In [5]:
raw_text = './data/raw_text.txt'

In [None]:
#统计词频
vocab_frequence = {}
with open(train_path, encoding="utf-8") as f:
    for line in f.read().splitlines():
        rows =  line.split('\t')[0:2]
        for sent in rows:
            for key in sent.split(' '):
                key = key.strip()
                vocab_frequence[key] = vocab_frequence.get(key,0)+1 
with open(test_path, encoding="utf-8") as f:
    for line in f.read().splitlines():
        for sent in rows:
            for key in sent.split(' '):
                key = key.strip()
                vocab_frequence[key] = vocab_frequence.get(key,0)+1

In [None]:
mini_frequence = 3
low_frequence_keys = set()
new_vocab = dict()
for key in vocab_frequence.keys():
    if vocab_frequence[key]<= mini_frequence:
        low_frequence_keys.add(key)
    else:
        new_vocab[key] =  vocab_frequence[key]

In [None]:
new_vocab = sorted(new_vocab.items(), key=lambda s: -s[1])

In [None]:
nezha_orgin_vocab = []
with open('./nezha-base-wwm/vocab.txt', encoding="utf-8") as f:
    for line in f.read().splitlines():
        line = line.strip()
        if line != '你':
            nezha_orgin_vocab.append(line)
        else:
            break

In [None]:
vocab = nezha_orgin_vocab + [key[0] for key in new_vocab]

# 生成mlm数据

In [None]:
def load_data_pair_sent(path,result):
    with open(path, encoding="utf-8") as f:
        for line in f.read().splitlines():
            rows =  line.split('\t')[0:2]
            a = []
            for key in rows[0].split(' '):
                key = key.strip()
                if key in low_frequence_keys:
                    key = '[UNK]'
                a.append(key)
            b = []
            for key in rows[1].split(' '):
                key = key.strip()
                if key in low_frequence_keys:
                    key = '[UNK]'
                b.append(key)
            result.append(' '.join(a) + ' [SEP] ' +' '.join(b))
            result.append(' '.join(b) + ' [SEP] ' +' '.join(a))

In [None]:
train_result =[]
test_result = []
load_data_pair_sent(train_path,train_result)
load_data_pair_sent(test_path,test_result)

all_result = train_result + test_result
with open(raw_text, 'w') as f:
    for key in all_result:
        f.write(str(key)+'\n')

In [None]:
with open(vocab_file,'w') as f:
    for key in vocab:
        f.write(str(key)+'\n')

# 开始训练

In [None]:
from transformers import BertTokenizer, LineByLineTextDataset
tokenizer = BertTokenizer(vocab_file=vocab_file)

In [None]:
dataset= LineByLineTextDataset(
    tokenizer = tokenizer,
    file_path = raw_text,
    block_size = 128  # maximum sequence length
)

In [None]:
config = NeZhaConfig.from_pretrained('./nezha-base-wwm/config.json')

In [None]:
model = NeZhaForMaskedLM.from_pretrained('./mynazhe_min_freqence', config=config)

In [None]:
# def set_only_allow_grad_names(model, layer_names ='emb', freeze=True):
#     for name, param in model.named_parameters():
#         if layer_names in name:
#             param.requires_grad = not freeze

In [None]:
# # 只允许更新embeding层
# set_only_allow_grad_names(model)

In [None]:
from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTrainedTokenizerBase

In [None]:
def _collate_batch(examples, tokenizer):
    """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
    # Tensorize if necessary.
    if isinstance(examples[0], (list, tuple)):
        examples = [torch.tensor(e, dtype=torch.long) for e in examples]

    # Check if padding is necessary.
    length_of_first = examples[0].size(0)
    are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
    if are_tensors_same_length:
        return torch.stack(examples, dim=0)

    # If yes, check if we have a `pad_token`.
    if tokenizer._pad_token is None:
        raise ValueError(
            "You are attempting to pad samples but the tokenizer you are using"
            f" ({tokenizer.__class__.__name__}) does not have a pad token."
        )

    # Creating the full tensor and filling it with our data.
    max_length = max(x.size(0) for x in examples)
    result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
    for i, example in enumerate(examples):
        if tokenizer.padding_side == "right":
            result[i, : example.shape[0]] = example
        else:
            result[i, -example.shape[0] :] = example
    return result

class DataCollatorForLanguageModeling:

    def __init__(self,tokenizer,mlm=True,mlm_probability=0.15):
        self.tokenizer = tokenizer
        self.mlm = mlm
        self.mlm_probability = mlm_probability

    def __call__(self, examples):
        # Handle dict or lists with proper padding and conversion to tensor.
        if isinstance(examples[0], (dict, BatchEncoding)):
            batch = self.tokenizer.pad(examples, return_tensors="pt")
        else:
            batch = {"input_ids": _collate_batch(examples, self.tokenizer)}

        # If special token mask has been preprocessed, pop it from the dict.
        special_tokens_mask = batch.pop("special_tokens_mask", None)
        if self.mlm:
            batch["input_ids"], batch["masked_lm_labels"] = self.mask_tokens(
                batch["input_ids"], special_tokens_mask=special_tokens_mask
            )
        else:
            labels = batch["input_ids"].clone()
            if self.tokenizer.pad_token_id is not None:
                labels[labels == self.tokenizer.pad_token_id] = -100
            batch["masked_lm_labels"] = labels
        return batch

    def mask_tokens(self, inputs, special_tokens_mask  = None):
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
        """
        labels = inputs.clone()
        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
        probability_matrix = torch.full(labels.shape, self.mlm_probability)
        if special_tokens_mask is None:
            special_tokens_mask = [
                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
            ]
            special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        else:
            special_tokens_mask = special_tokens_mask.bool()

        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 10% of the time, we replace masked input tokens with random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels

In [None]:
from transformers import BertConfig, BertForMaskedLM

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./mynazhe/',
    overwrite_output_dir=True,
    num_train_epochs=300,
    per_device_train_batch_size=128,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    learning_rate =  5e-6
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    
)

In [None]:
trainer.train()
trainer.save_model("./mynazhe_min_freqence")
#loss 0.292800

In [None]:
trainer.save_model("./mynazhe_min_freqence")

# 训练

In [6]:
from transformers import RobertaTokenizerFast,BertTokenizerFast,BertTokenizer
from torch.utils.data.dataset import Dataset
from transformers import DataCollatorForLanguageModeling,DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM
from transformers import RobertaTokenizerFast,BertTokenizer,RobertaTokenizer
from transformers import RobertaModel,BertModel
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader
import math

import torch
import numpy as np
import torch.nn as nn


In [7]:
train_path = './data/gaiic_track3_round1_train_20210228.tsv'
test_path = './data/gaiic_track3_round1_testA_20210228.tsv'

In [8]:

def load_data(path):
    max_length = 0
    samples = []
    labels = []
    with open(path, encoding="utf-8") as f:
        for line in f.read().splitlines():
            temp = line.split('\t')
            new_line =temp[0] +' [SEP] ' + temp[1]
            samples.append(new_line)
            if len(temp)>2:
                labels.append(int(temp[2]))
    return samples,labels,max_length

In [9]:
train_texts,train_labels,max_length =  load_data(train_path)
test_texts, _, _ = load_data(test_path)
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.1)

In [10]:
f'train: {len(train_texts)},val:{len(val_texts)}'

'train: 90000,val:10000'

In [11]:
tokenizer = BertTokenizer(vocab_file=vocab_file)
config = NeZhaConfig.from_pretrained('./mynazhe_min_freqence/config.json',num_labels=2)
bert = NeZhaForSequenceClassification.from_pretrained('./mynazhe_min_freqence', config=config)

Some weights of the model checkpoint at ./mynazhe_min_freqence were not used when initializing NeZhaForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing NeZhaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NeZhaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of NeZhaForSequenceClassification were not initialized from the model checkpoint at ./mynazhe_min_freq

In [12]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [13]:
class MatchingDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels,tokenizer):
        self.texts = texts
        self.labels = labels
    def __getitem__(self, idx):
        #item = tokenizer(self.texts[idx])
        examples = tokenizer(self.texts[idx], add_special_tokens=True, truncation=True, max_length=max_length)
        #examples = batch_encoding
        #examples = {"input_ids": sent_1['input_ids'],"input_ids_sent2":sent_2['input_ids']}
        if self.labels:
            examples['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return examples
    def __len__(self):
        return len(self.texts)


In [14]:
# class TextMatching(nn.Module):
#     def __init__(self, embeding_layer, embeding_size):
#         super(TextMatching, self).__init__()
#         self.embeding_layer = embeding_layer;
#         self.embeding_size = embeding_size
#         self.liner = nn.Linear(embeding_size, embeding_size)
#         #self.lstm =  nn.LSTM(embeding_size, embeding_size)
#         self.dropout = nn.Dropout(0.1)

#         self.liner2= nn.Linear(embeding_size, 1)
#         #self.criterion = torch.nn.BCEWithLogitsLoss()
#         self.sigmoid = nn.Sigmoid()
#     def forward(self, input_ids,attention_mask):
        
#         hidden = self.embeding_layer(input_ids =input_ids,attention_mask=attention_mask)
#         print(hidden)
#         #hidden = self.lstm(hidden.last_hidden_state)
#         hidden = self.dropout(hidden[1])
#         hidden = self.liner(hidden)
#         hidden = self.liner2(hidden)
#         output = self.sigmoid(hidden)
#         return output.squeeze()

In [15]:
#model = TextMatching(bert,768)
bert.to(device)

NeZhaForSequenceClassification(
  (bert): NeZhaModel(
    (embeddings): NeZhaEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): NeZhaEncoder(
      (layer): ModuleList(
        (0): NeZhaLayer(
          (attention): NeZhaAttention(
            (self): NeZhaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (relative_positions_encoding): RelativePositionsEncoding()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Layer

In [16]:
train_dataset = MatchingDataset(train_texts,train_labels,tokenizer)
val_dataset = MatchingDataset(val_texts,val_labels,tokenizer)
data_collator = DataCollatorWithPadding(tokenizer)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True,collate_fn=data_collator)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=True,collate_fn=data_collator)

In [17]:
#criterion = nn.BCELoss()
lr = 1e-5 # learning rate
#optimizer = torch.optim.SGD(model.parameters(), lr=lr)
optimizer = torch.optim.Adam(bert.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [18]:
# from torch.optim.lr_scheduler import LambdaLR

In [19]:
# def reduce_lr(epoch):
#     if epoch == 3:
#         return 0.1
#     return 1
# scheduler = LambdaLR(optimizer, lr_lambda=reduce_lr)


In [20]:
# class FGM():
#     def __init__(self, model):
#         self.model = model
#         self.backup = {}

#     def attack(self, epsilon=1., emb_name='emb.'):
#         # emb_name这个参数要换成你模型中embedding的参数名
#         for name, param in self.model.named_parameters():
#             if param.requires_grad and emb_name in name:
#                 self.backup[name] = param.data.clone()
#                 norm = torch.norm(param.grad)
#                 if norm != 0:
#                     r_at = epsilon * param.grad / norm
#                     param.data.add_(r_at)

#     def restore(self, emb_name='emb.'):
#         # emb_name这个参数要换成你模型中embedding的参数名
#         for name, param in self.model.named_parameters():
#             if param.requires_grad and emb_name in name: 
#                 assert name in self.backup
#                 param.data = self.backup[name]
#         self.backup = {}

In [21]:
#fgm = FGM(model)

In [34]:
import time
def train(epoch):
    bert.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    for batch_index, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        #attention_mask = batch['attention_mask'].to(device)
        loss, logits  = bert(input_ids,attention_mask,labels=labels)[:2]
        loss.backward()
        
#         fgm.attack() 
#         output_adv = model(input_ids,attention_mask)
#         loss_adv = criterion(output_adv, batch['labels'].to(device))
#         loss_adv.backward() # 反向传播，并在正常的grad基础上，累加对抗训练的梯度
#         fgm.restore() # 恢复embedding参数
        
        #torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        total_loss += loss.item()
        log_interval = 200
        if batch_index % log_interval == 0 and batch_index > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.6f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch_index, len(train_loader) , scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))#scheduler.get_lr()[0]
            total_loss = 0
            start_time = time.time()


def evaluate(eval_model, val_loader):
    eval_model.eval() 
    y_pred = []
    y_true = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            output = eval_model(input_ids,attention_mask)[0]
            output = output.cpu().numpy()
            #output_score = np.exp(output[:, 1])/ (np.exp(output).sum(axis=1))
            output_score = np.exp(output)/ (np.exp(output).sum(axis=1,keepdims=True))
            y_pred.append(output_score[:, 1]/output_score.sum(axis=1))
            y_true.append(batch['labels'].numpy())
    y_pred = np.concatenate(y_pred)
    y_true = np.concatenate(y_true)
    return roc_auc_score(y_true, y_pred)

In [None]:
best_val_auc = float("inf")
epochs = 10
best_model = None
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_auc = evaluate(bert, val_loader)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid auc {:5.5f}  '.format(epoch, (time.time() - epoch_start_time),val_auc))
    print('-' * 89)

    if val_auc > best_val_auc:
        best_val_loss = val_auc
        best_model = model
    #scheduler.step()
    

In [35]:
evaluate(bert, val_loader)

0.9591093476755004

# test

In [37]:
test_dataset = MatchingDataset(test_texts,None,tokenizer)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False,collate_fn=data_collator)

In [42]:
bert.eval() 
y_pred = []
with torch.no_grad():
    for batch_index, batch in enumerate(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        output = bert(input_ids,attention_mask)[0]
        output = output.cpu().numpy()
        #output_score = np.exp(output[:, 1])/ (np.exp(output).sum(axis=1))
        output_score = np.exp(output)/ (np.exp(output).sum(axis=1,keepdims=True))
        y_pred.append(output_score[:, 1]/output_score.sum(axis=1))
y_pred = np.concatenate(y_pred)


In [44]:
with open('./result.txt', 'w') as f:
    for val in y_pred:
        f.write(str(val)+'\n')