In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '4,5,6,7'

In [2]:
import torch
from transformers import LineByLineTextDataset
from models.nezha_model.modeling_nezha import NeZhaForSequenceClassification,NeZhaForMaskedLM,NeZhaPreTrainedModel
from models.nezha_model.configuration_nezha import NeZhaConfig

In [3]:
import numpy as np

# 数据预处理

In [4]:
train_path = './data/gaiic_track3_round1_train_20210228.tsv'
test_path = './data/gaiic_track3_round1_testA_20210228.tsv'

In [5]:
vocab_file = './models/tokens_ngram.txt'

In [6]:
raw_text = './data/raw_text_ngram.txt'

In [None]:
# Count token frequencies.
# For every line of the train and test files, take the first two tab-separated
# columns (the sentence pair) and count each space-separated token.
vocab_frequence = {}
for corpus_path in (train_path, test_path):
    with open(corpus_path, encoding="utf-8") as f:
        for line in f.read().splitlines():
            # Fix: the test-file loop previously never re-split `line` and kept
            # iterating over the stale `rows` left over from the last train line,
            # so test tokens were never counted (and that last train line was
            # over-counted once per test line).
            rows = line.split('\t')[0:2]
            for sent in rows:
                for key in sent.split(' '):
                    key = key.strip()
                    vocab_frequence[key] = vocab_frequence.get(key, 0) + 1

In [None]:
# mini_frequence = 3
# low_frequence_keys = set()
# new_vocab = dict()
# for key in vocab_frequence.keys():
#     if vocab_frequence[key]<= mini_frequence:
#         low_frequence_keys.add(key)
#     else:
#         new_vocab[key] =  vocab_frequence[key]

In [None]:
vocab_frequence = sorted(vocab_frequence.items(), key=lambda s: -s[1])

In [None]:
len(vocab_frequence)

In [None]:
# Keep the leading part of the original NeZha vocabulary: every (stripped)
# entry that comes before the token '你'. The stop token itself is not kept.
nezha_orgin_vocab = []
with open('./models/nezha-base-wwm/vocab.txt', encoding="utf-8") as f:
    for line in map(str.strip, f.read().splitlines()):
        if line == '你':
            break
        nezha_orgin_vocab.append(line)

In [None]:
vocab = nezha_orgin_vocab + [key[0] for key in vocab_frequence]

In [None]:
len(vocab)

# 生成mlm数据

In [None]:
# Low-frequency tokens are NOT removed here.
def load_data_pair_sent(path,result):
    """Append both orderings of every sentence pair in `path` to `result`.

    Each line of the file is split on tabs and only the first two columns are
    used. Every space-separated token is stripped and the tokens are re-joined
    with single spaces. For a pair (a, b) two strings are appended to `result`,
    in this order: "a [SEP] b" and "b [SEP] a".

    Args:
        path: UTF-8 text file with at least two tab-separated columns per line.
        result: list that is extended in place.

    Returns:
        None.

    Raises:
        IndexError: if a line has fewer than two tab-separated columns.
    """
    with open(path, encoding="utf-8") as f:
        for line in f.read().splitlines():
            columns = line.split('\t')[0:2]
            first = ' '.join(token.strip() for token in columns[0].split(' '))
            second = ' '.join(token.strip() for token in columns[1].split(' '))
            result.append(first + ' [SEP] ' + second)
            result.append(second + ' [SEP] ' + first)

In [None]:
# Build the MLM corpus: both orderings of every train and test sentence pair,
# one example per line.
train_result = []
test_result = []
load_data_pair_sent(train_path, train_result)
load_data_pair_sent(test_path, test_result)

all_result = train_result + test_result
# Write explicitly as UTF-8: the inputs are read as UTF-8, so the output should
# not depend on the platform's default locale encoding.
with open(raw_text, 'w', encoding="utf-8") as f:
    for key in all_result:
        f.write(str(key) + '\n')

In [None]:
# Write the merged vocabulary, one token per line.
# The NeZha prefix was read from a UTF-8 file, so write UTF-8 explicitly instead
# of relying on the platform's default locale encoding.
with open(vocab_file, 'w', encoding="utf-8") as f:
    for key in vocab:
        f.write(str(key) + '\n')

# 开始训练

In [None]:
from transformers import BertTokenizer, LineByLineTextDataset
tokenizer = BertTokenizer(vocab_file=vocab_file)

In [None]:
dataset= LineByLineTextDataset(
    tokenizer = tokenizer,
    file_path = raw_text,
    block_size = 32  # maximum sequence length
)

In [None]:
config = NeZhaConfig.from_pretrained('./models/mynezha_ngram/config.json')

In [None]:
model = NeZhaForMaskedLM.from_pretrained('./models/mynezha_ngram', config=config)

In [None]:
# def set_only_allow_grad_names(model, layer_names ='emb', freeze=True):
#     for name, param in model.named_parameters():
#         if layer_names in name:
#             param.requires_grad = not freeze

In [None]:
# # 只允许更新embeding层
# set_only_allow_grad_names(model)

In [None]:
# ngram =3
# max_predictions_per_seq = 20
random_seed = 2021
np.random.seed(random_seed)
# The masking collator below samples with torch.bernoulli / torch.randint, so
# torch's generator has to be seeded as well as NumPy's.
torch.manual_seed(random_seed)

In [None]:
from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTrainedTokenizerBase

In [None]:
def _collate_batch(examples, tokenizer):
    """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
    # Tensorize if necessary.
    if isinstance(examples[0], (list, tuple)):
        examples = [torch.tensor(e, dtype=torch.long) for e in examples]

    # Check if padding is necessary.
    length_of_first = examples[0].size(0)
    are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
    if are_tensors_same_length:
        return torch.stack(examples, dim=0)

    # If yes, check if we have a `pad_token`.
    if tokenizer._pad_token is None:
        raise ValueError(
            "You are attempting to pad samples but the tokenizer you are using"
            f" ({tokenizer.__class__.__name__}) does not have a pad token."
        )

    # Creating the full tensor and filling it with our data.
    max_length = max(x.size(0) for x in examples)
    result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
    for i, example in enumerate(examples):
        if tokenizer.padding_side == "right":
            result[i, : example.shape[0]] = example
        else:
            result[i, -example.shape[0] :] = example
    return result


In [None]:
class DataCollatorForLanguageModelingNgram:
    """Masked-LM data collator that sometimes extends a masked token into a 2-gram.

    Tokens are first selected for masking independently with probability
    `mlm_probability` (special tokens are never selected). Each selected token
    then extends the mask to its right-hand neighbour with probability
    `ngram_probability`, provided that neighbour is not a special token.
    Masked positions are replaced by [MASK] 80% of the time, by a random
    vocabulary id 10% of the time, and left unchanged otherwise.

    Args:
        tokenizer: tokenizer used for padding, special-token detection and the
            mask token id.
        mlm: when False, no masking is done and the labels are a copy of the
            inputs with pad positions set to -100.
        mlm_probability: per-token probability of being selected for masking.
        ngram_probability: probability that a selected token also masks the
            token to its right (defaults to the previously hard-coded 0.4).
    """

    def __init__(self,tokenizer,mlm=True,mlm_probability=0.15,ngram_probability=0.4):
        self.tokenizer = tokenizer
        self.mlm = mlm
        self.mlm_probability = mlm_probability
        self.ngram_probability = ngram_probability

    def __call__(self, examples):
        """Collate `examples` into a batch dict with `input_ids` and `masked_lm_labels`."""
        # Handle dict or lists with proper padding and conversion to tensor.
        if isinstance(examples[0], (dict, BatchEncoding)):
            batch = self.tokenizer.pad(examples, return_tensors="pt")
        else:
            batch = {"input_ids": _collate_batch(examples, self.tokenizer)}

        # If special token mask has been preprocessed, pop it from the dict.
        special_tokens_mask = batch.pop("special_tokens_mask", None)
        if self.mlm:
            batch["input_ids"], batch["masked_lm_labels"] = self.mask_tokens(
                batch["input_ids"], special_tokens_mask=special_tokens_mask
            )
        else:
            labels = batch["input_ids"].clone()
            if self.tokenizer.pad_token_id is not None:
                labels[labels == self.tokenizer.pad_token_id] = -100
            batch["masked_lm_labels"] = labels
        return batch

    def mask_tokens(self, inputs, special_tokens_mask  = None):
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.

        Args:
            inputs: 2-D tensor of token ids; it is modified in place.
            special_tokens_mask: optional tensor of the same shape marking
                special tokens; computed from the tokenizer when omitted.

        Returns:
            Tuple `(inputs, labels)` where `labels` is -100 at every position
            that was not selected for masking.
        """
        labels = inputs.clone()
        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
        probability_matrix = torch.full(labels.shape, self.mlm_probability)
        # Per-position probability of growing a masked token into a 2-gram.
        ngram_matrix = torch.full(labels.shape, self.ngram_probability)
        if special_tokens_mask is None:
            special_tokens_mask = [
                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
            ]
            special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        else:
            special_tokens_mask = special_tokens_mask.bool()

        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)

        masked_indices = torch.bernoulli(probability_matrix).bool()
        ngram_indices = torch.bernoulli(ngram_matrix).bool()

        # Decide which 1-gram masks become 2-grams. A position j+1 is added when
        # position j was selected in the initial draw, its n-gram coin came up
        # True, and j+1 is not a special token. This is the vectorized form of
        # the former per-element Python loop; it reads only the initial draw, so
        # an added position never triggers a further extension.
        grows_right = masked_indices & ngram_indices
        extension = grows_right[:, :-1] & ~special_tokens_mask[:, 1:]
        masked_indices[:, 1:] = masked_indices[:, 1:] | extension

        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 10% of the time, we replace masked input tokens with random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels

In [None]:
from transformers import BertConfig, BertForMaskedLM

data_collator = DataCollatorForLanguageModelingNgram(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
from transformers import Trainer, TrainingArguments

# MLM pre-training configuration.
# NOTE: output_dir is the same directory the starting checkpoint was loaded from
# ('./models/mynezha_ngram'), and overwrite_output_dir=True, so the saved model
# replaces the starting weights.
training_args = TrainingArguments(
    output_dir='./models/mynezha_ngram/',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=128,
    save_steps=5_000,
    save_total_limit=2,
    prediction_loss_only=True,
    learning_rate =  5e-5
)

# Train the NeZha masked-LM on the line-by-line pair corpus, using the n-gram
# masking collator defined above. No evaluation dataset is passed.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    
)

In [None]:
trainer.train()
trainer.save_model("./models/mynezha_ngram")
#loss 0.292800
#loss 0.433000
#loss 0.376000

In [None]:
trainer.save_model("./models/mynezha_ngram")

# 训练

In [7]:
from transformers import RobertaTokenizerFast,BertTokenizerFast,BertTokenizer
from torch.utils.data.dataset import Dataset
from transformers import DataCollatorForLanguageModeling,DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM
from transformers import RobertaTokenizerFast,BertTokenizer,RobertaTokenizer
from transformers import RobertaModel,BertModel
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader
import math

import torch
import numpy as np
import torch.nn as nn


In [8]:
train_path = './data/gaiic_track3_round1_train_20210228.tsv'
test_path = './data/gaiic_track3_round1_testA_20210228.tsv'

In [9]:
def load_data(path):
    """Read a tab-separated sentence-pair file.

    Args:
        path: UTF-8 file whose lines are `sent_a<TAB>sent_b[<TAB>label]`.

    Returns:
        Tuple `(samples, labels, max_length)`:
        - samples: np.ndarray built from the `[sent_a, sent_b]` string pairs.
        - labels: np.ndarray of the integer third column, taken only from lines
          that have one (empty for an unlabelled file).
        - max_length: always 0; it is initialised here and never updated.

    Raises:
        IndexError: if a line has fewer than two tab-separated columns.
    """
    max_length = 0
    samples, labels = [], []
    with open(path, encoding="utf-8") as f:
        for line in f.read().splitlines():
            fields = line.split('\t')
            samples.append([fields[0], fields[1]])
            # The label column is optional (the test file has none).
            if len(fields) > 2:
                labels.append(int(fields[2]))
    return np.array(samples), np.array(labels), max_length

In [27]:
class MatchingDataset(torch.utils.data.Dataset):
    """Dataset of sentence pairs, tokenized on access as "a [SEP] b".

    Args:
        texts: indexable collection of `[sent_a, sent_b]` string pairs.
        labels: np.ndarray of integer labels, or any non-ndarray value (e.g.
            None) for unlabelled data, in which case items carry no 'labels'.
        tokenizer: callable tokenizer applied to each joined pair.
    """
    def __init__(self, texts, labels,tokenizer):
        self.texts = texts
        self.labels = labels
        # Fix: keep the tokenizer that was passed in. It used to be ignored and
        # __getitem__ silently used whatever global `tokenizer` existed.
        self.tokenizer = tokenizer
    def __getitem__(self, idx):
        """Return the tokenized pair at `idx`, plus a 'labels' tensor when labels are an ndarray."""
        text = self.texts[idx][0]+' [SEP] '+ self.texts[idx][1]
        # `max_length` is the notebook-level value unpacked from load_data().
        examples = self.tokenizer(text, add_special_tokens=True, truncation=True, max_length=max_length)
        if isinstance(self.labels,np.ndarray):
            examples['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return examples
    def __len__(self):
        """Number of sentence pairs."""
        return len(self.texts)


In [11]:
train_texts,train_labels,max_length =  load_data(train_path)
test_texts, _, _ = load_data(test_path)

# 数据增强

In [12]:
import networkx as nx
pos_G = nx.Graph()
neg_G = nx.Graph()

In [13]:
# Add every labelled training pair as an edge: label 1 goes into the positive
# graph, any other label into the negative graph.
for index, label in enumerate(train_labels):
    target_graph = pos_G if label == 1 else neg_G
    target_graph.add_edge(train_texts[index][0], train_texts[index][1])

In [14]:
# Augment through the connected components of the positive graph:
# - every unordered pair of sentences inside one component becomes a positive pair;
# - for each sentence in a component that also appears in the negative graph,
#   each of its negative neighbours is paired (as a negative) with every OTHER
#   sentence of that component.
pos_dataset = []
neg_dataset = []
for component in nx.connected_components(pos_G):
    nodes = list(component)
    for position, anchor in enumerate(nodes):
        pos_dataset.extend([anchor, later] for later in nodes[position + 1:])
        if anchor not in neg_G:
            continue
        others = nodes[:position] + nodes[position + 1:]
        for neiber_neg in neg_G.adj[anchor]:
            neg_dataset.extend([neiber_neg, other] for other in others)

In [21]:
train_texts_aug = pos_dataset + neg_dataset
train_labels_aug = [1]*len(pos_dataset)+ [0]*len(neg_dataset)

In [22]:
# Merge in the original negative samples (label 0) from the training set.
for index, label in enumerate(train_labels):
    if label != 0:
        continue
    train_texts_aug.append(train_texts[index])
    train_labels_aug.append(0)

In [23]:
train_texts_aug = np.array(train_texts_aug)
train_labels_aug = np.array(train_labels_aug)

In [24]:
f'正样本，原始:{(np.array(train_labels)>0).sum()},增强后:{(train_labels_aug>0).sum()}'

'正样本，原始:36436,增强后:53392'

In [25]:
f'负样本，原始:{(np.array(train_labels)==0).sum()},增强后:{(train_labels_aug==0).sum()}'

'负样本，原始:63564,增强后:73027'

In [26]:
train_texts_aug[0]

array(['12 23 25 6 26 27 19', '17 18 12 19 20 21 22 23 24'], dtype='<U236')

# 五折训练

In [29]:
from sklearn.model_selection import train_test_split,StratifiedKFold
import gc

In [30]:
tokenizer = BertTokenizer(vocab_file=vocab_file)
config = NeZhaConfig.from_pretrained('./models/mynezha_ngram/config.json',num_labels=2)
data_collator = DataCollatorWithPadding(tokenizer)

In [31]:
test_dataset = MatchingDataset(test_texts,None,tokenizer)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False,collate_fn=data_collator)

In [32]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [33]:
class FGM():
    """Adversarial perturbation of embedding weights along the gradient direction.

    `attack` backs up every trainable parameter whose name contains `emb_name`
    and adds `epsilon * grad / ||grad||` to it; `restore` puts the backed-up
    values back. Call `attack` after a backward pass (so gradients exist) and
    `restore` before the optimizer step.
    """
    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self, epsilon=0.5, emb_name='word_embeddings.'):
        """Perturb matching parameters in place.

        Args:
            epsilon: L2 norm of the perturbation added to each matching parameter.
            emb_name: substring identifying the embedding parameters of the model;
                change it to match the parameter names of your model.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                # Always back up, so restore() finds every matching parameter.
                self.backup[name] = param.data.clone()
                # Fix: a parameter that received no gradient used to crash
                # torch.norm(None); leave it unperturbed instead.
                if param.grad is None:
                    continue
                norm = torch.norm(param.grad)
                # Fix: NaN != 0 is True, so a NaN norm used to write NaN into
                # the weights; skip the perturbation in that case too.
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings.'):
        """Restore the parameters saved by `attack` and clear the backup.

        Args:
            emb_name: must be the same substring that was passed to `attack`.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}

In [34]:
import time
def train(model,fgm,train_loader,optimizer,device):
    """Run one training epoch with an adversarial second pass per batch.

    For every batch: a normal forward/backward pass, then `fgm.attack()`, a
    second forward/backward pass whose gradients accumulate onto the first,
    `fgm.restore()`, and a single optimizer step.

    Args:
        model: module called as `model(input_ids, attention_mask, labels=...)`,
            returning a sequence whose first two items are (loss, logits).
        fgm: object providing `attack()` and `restore()`.
        train_loader: iterable of dicts with 'input_ids', 'attention_mask', 'labels'.
        optimizer: optimizer over the model parameters.
        device: device the batch tensors are moved to.

    Returns:
        Mean clean (pre-attack) loss over the batches, or 0.0 for an empty loader.
        Previously `total_loss` was initialised but never updated and nothing
        was returned.
    """
    model.train() # Turn on the train mode
    total_loss = 0.
    num_batches = 0
    for batch_index, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clean pass.
        loss, logits  = model(input_ids,attention_mask,labels=labels)[:2]
        loss.backward()
        total_loss += loss.item()
        num_batches += 1

        # Adversarial pass on perturbed embeddings; its gradients are added to
        # those of the clean pass.
        fgm.attack()
        adv_loss, _ = model(input_ids,attention_mask,labels=labels)[:2]
        adv_loss.backward()
        fgm.restore() # restore the embedding parameters

        #torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
    return total_loss / num_batches if num_batches else 0.0

def evaluate(eval_model, val_loader,device):
    """Compute the ROC AUC of the class-1 probability on a validation loader.

    Args:
        eval_model: module called as `eval_model(input_ids, attention_mask)`,
            whose first output is a (batch, num_classes) logits tensor.
        val_loader: iterable of dicts with 'input_ids', 'attention_mask', 'labels'.
        device: device the input tensors are moved to.

    Returns:
        `roc_auc_score(y_true, y_pred)` over all batches.
    """
    eval_model.eval()
    y_pred = []
    y_true = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            logits = eval_model(input_ids,attention_mask)[0].cpu().numpy()
            # Softmax with the row maximum subtracted first: exponentiating raw
            # logits overflows to inf for large values and yields NaN.
            exp_shifted = np.exp(logits - logits.max(axis=1, keepdims=True))
            probs = exp_shifted / exp_shifted.sum(axis=1, keepdims=True)
            # Rows already sum to 1, so no second normalisation is needed.
            y_pred.append(probs[:, 1])
            y_true.append(batch['labels'].numpy())
    y_pred = np.concatenate(y_pred)
    y_true = np.concatenate(y_true)
    return roc_auc_score(y_true, y_pred)

def predict_test(model,test_loader,device=None):
    """Return the raw model outputs (logits) for every batch of `test_loader`.

    Args:
        model: module called as `model(input_ids, attention_mask)`; its first
            output is collected.
        test_loader: iterable of dicts with 'input_ids' and 'attention_mask'.
        device: device the inputs are moved to. When omitted, the device of the
            model's first parameter is used (CPU if the model has none), so the
            function no longer depends on a global `device` variable.

    Returns:
        np.ndarray with the per-batch outputs concatenated along axis 0.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')
    model.eval()
    y_pred = []
    with torch.no_grad():
        for batch_index, batch in enumerate(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            output = model(input_ids,attention_mask)[0]
            output = output.cpu().numpy()
            y_pred.append(output)
    y_pred = np.concatenate(y_pred)
    return y_pred

In [35]:
epochs = 5
lr = 1e-5 # learning rate

In [37]:
from sklearn.model_selection import StratifiedKFold
import gc
score = []
# Running sum of the raw test logits of each trained fold.
submit = np.zeros((len(test_texts), 2))
skf = StratifiedKFold(n_splits=5, random_state=1017, shuffle=True)
# torch.save does not create missing parent directories.
os.makedirs('./kfold', exist_ok=True)
for i, (train_index, val_index) in enumerate(skf.split(train_texts_aug, train_labels_aug)):
    print("FOLD | ", i+1)
    print("###"*35)
    gc.collect()

    train_dataset = MatchingDataset(train_texts_aug[train_index],train_labels_aug[train_index],tokenizer)
    val_dataset = MatchingDataset(train_texts_aug[val_index],train_labels_aug[val_index],tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True,collate_fn=data_collator)
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=True,collate_fn=data_collator)

    # Every fold starts again from the pre-trained MLM checkpoint.
    model = NeZhaForSequenceClassification.from_pretrained('./models/mynezha_ngram', config=config)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    fgm = FGM(model)

    best_auc =0
    current_fold_path = f'./kfold/fold_{i}.pkl'
    # Fix: the epoch loop used to reuse `i`, clobbering the fold index.
    for epoch in range(epochs):
        train(model,fgm,train_loader,optimizer,device)
        auc = evaluate(model,val_loader,device)
        print(f'epoch:{epoch},auc:{auc}')
        # Keep only the checkpoint with the best validation AUC for this fold.
        if auc >best_auc:
            best_auc = auc
            torch.save(model.state_dict(), current_fold_path)
    print(f'best auc:{best_auc}')
    model.load_state_dict(torch.load(current_fold_path))
    test_result = predict_test(model,test_loader)
    submit += test_result
    # NOTE(review): this `break` stops after the first fold, yet a later cell
    # loads fold_0 .. fold_4 and averages over 5 — confirm whether it is a
    # debugging leftover before relying on the 5-fold ensemble.
    break

FOLD |  1
#########################################################################################################


Some weights of the model checkpoint at ./models/mynezha_ngram were not used when initializing NeZhaForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing NeZhaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NeZhaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of NeZhaForSequenceClassification were not initialized from the model checkpoint at ./models/mynezha_n

epoch:0,auc:0.9702477120210106
epoch:1,auc:0.9771391596525401
epoch:2,auc:0.9790498355755788


KeyboardInterrupt: 

In [None]:
# Fix: numpy arrays have no `.save` method; use np.save(file, array).
np.save('./kfold_result.npy', submit)

In [None]:
test_loader

In [None]:
# Class probabilities on the test set from the fold-0 checkpoint only.
y_pred = np.zeros((len(test_texts), 2))
current_fold_path = f'./kfold/fold_{0}.pkl'
model.load_state_dict(torch.load(current_fold_path))
output = predict_test(model,test_loader)
# Softmax with the row maximum subtracted first, so large logits cannot
# overflow np.exp and produce NaN.
exp_shifted = np.exp(output - output.max(axis=1, keepdims=True))
output_score = exp_shifted / exp_shifted.sum(axis=1, keepdims=True)
y_pred += output_score

In [None]:
# Sum the softmax probabilities of the five fold checkpoints on the test set.
y_pred = np.zeros((len(test_texts), 2))
for i in range(5):
    current_fold_path = f'./kfold/fold_{i}.pkl'
    model.load_state_dict(torch.load(current_fold_path))
    output = predict_test(model,test_loader)
    # Softmax with the row maximum subtracted first, so large logits cannot
    # overflow np.exp and produce NaN.
    exp_shifted = np.exp(output - output.max(axis=1, keepdims=True))
    output_score = exp_shifted / exp_shifted.sum(axis=1, keepdims=True)
    y_pred += output_score

In [None]:
y_pred = y_pred /5

In [None]:
y_pred = y_pred[:, 1]/y_pred.sum(axis=1)

In [None]:
y_pred

In [None]:
#f'train: {len(train_texts)},val:{len(val_texts)},test:{len(test_texts)}'

# test

In [None]:
# Write one prediction per line to the submission file.
with open('./result.txt', 'w') as f:
    f.writelines(str(val) + '\n' for val in y_pred)