In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author  : nsytsqdtn
# @Blog    ：https://www.nsytsqdtn.cn
import pandas as pd
import numpy as np
import codecs
import collections
import re
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from transformers import BertForPreTraining, BertModel, BertTokenizer, AutoModel, AutoTokenizer
import warnings
import torch.nn as nn
from tqdm import tqdm
import random
import os
from torch.utils import data
from torch import nn
import torch.nn.functional as F
from torch.optim import *
# Notebook-wide configuration: print options, device selection, hyper-parameters and RNG seeding.
# Show up to 768 items at each edge of a dimension when printing tensors.
torch.set_printoptions(edgeitems=768)
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Never summarise numpy arrays with "..." when printing.
np.set_printoptions(threshold=np.inf)
# Use the GPU when PyTorch can see one, otherwise the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Path handed to `from_pretrained` for both the tokenizer and the BERT weights.
BERT_MODEL_PATH = 'model'
# Basic parameters
MAX_LEN = 50  # length every encoded sentence pair is truncated / zero-padded to
BATCH_SIZE = 32  # batch size used by every DataLoader built in this notebook
SEP_TOKEN_ID = 102  # token id treated as [SEP] when segment ids are derived
SEED=20210214  # shared seed for `random`, numpy, torch and the K-fold split
NAME = 'wwm'  # prefix of the log file and of the saved checkpoint files
random.seed(SEED)
# NOTE(review): this only reaches processes started after this point; the hash seed of the
# already-running interpreter is presumably fixed at start-up — confirm if hash order matters.
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if DEVICE=='cuda':
    torch.cuda.manual_seed(SEED)
    # Ask cuDNN for deterministic kernels and disable its auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
# Last expression of the cell: displays the selected device.
DEVICE

  from pandas import Panel


'cuda'

In [2]:
# Load the labelled sentence pairs; the TSV has no header row, so columns are named here.
train_data = pd.read_csv('../data/gaiic_track3_round1_train_20210228.tsv', sep='\t', header=None)
train_data = train_data.rename(columns={0: 'sen1', 1: 'sen2', 2: 'labels'})
# Drop incomplete rows BEFORE casting: a missing label is read as NaN and
# `astype(int)` raises on NaN, so the cast must only ever see complete rows.
train_data = train_data.dropna().reset_index(drop=True)
train_data['labels'] = train_data['labels'].astype(int)
# Unlabelled test pairs (two columns only). Rows are kept as-is so that the
# number of predictions matches the number of test lines.
test_data = pd.read_csv('../data/gaiic_track3_round1_testB_20210317.tsv', sep='\t', header=None)
test_data = test_data.rename(columns={0: 'sen1', 1: 'sen2'})
train_data.shape, test_data.shape

((100000, 3), (25000, 2))

In [3]:
# Turn each frame into a list of per-example dicts with keys 'sen1', 'sen2', 'labels'.
# Columns are walked once in lockstep instead of doing three `.loc[i, col]` lookups per
# row: this is much faster and does not depend on the frame having a 0..n-1 index.
train_dataset = [
    {'sen1': sen1, 'sen2': sen2, 'labels': label}
    for sen1, sen2, label in tqdm(
        zip(train_data['sen1'], train_data['sen2'], train_data['labels']),
        total=len(train_data),
    )
]
# Test examples carry no label; the key is kept (as None) so both lists share one schema.
test_dataset = [
    {'sen1': sen1, 'sen2': sen2, 'labels': None}
    for sen1, sen2 in tqdm(
        zip(test_data['sen1'], test_data['sen2']),
        total=len(test_data),
    )
]

HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




In [4]:
class DataSet(data.Dataset):
    """Torch dataset that encodes sentence pairs as fixed-length BERT inputs.

    Every example is tokenized once, at construction time, into
    ``[CLS] sen1 [SEP] sen2 [SEP]`` and truncated / zero-padded to ``MAX_LEN``
    token ids. Segment ids are derived from the token ids on access.

    Args:
        data: iterable of dicts with the keys ``'sen1'``, ``'sen2'`` and ``'labels'``.
        mode: ``'test'`` makes ``__getitem__`` return ``(token_ids, seg_ids)``;
            any other value additionally returns the label tensor.
        sep_token_id: token id treated as ``[SEP]`` when building segment ids.
            Defaults to the module-level ``SEP_TOKEN_ID``.
    """

    def __init__(self, data, mode='train', sep_token_id=None):
        self.data = data
        self.tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None)
        self.mode = mode
        # Captured once so the segment logic does not read a global on every access.
        self.sep_token_id = SEP_TOKEN_ID if sep_token_id is None else sep_token_id
        self.dataset = self.get_data(self.data, self.tokenizer, self.mode)

    def get_data(self, data, tokenizer, mode):
        """Tokenize every example and return a list of dicts.

        Each returned dict holds the raw sentences, the padded ``token_ids``
        list (length ``MAX_LEN``) and the untouched ``labels`` value.
        ``mode`` is accepted for interface compatibility and is not used here.
        """
        dataset = []
        for data_li in tqdm(data):
            sen1 = data_li['sen1']
            sen2 = data_li['sen2']
            labels = data_li['labels']
            # Reserve 3 positions for [CLS] and the two [SEP]; the second
            # sentence only gets whatever room the first one leaves.
            token1 = tokenizer.tokenize(sen1)[:MAX_LEN - 3]
            token2 = tokenizer.tokenize(sen2)[:MAX_LEN - len(token1) - 3]
            tokens = ['[CLS]'] + token1 + ['[SEP]'] + token2 + ['[SEP]']
            token_ids = tokenizer.convert_tokens_to_ids(tokens)[:MAX_LEN]
            # Pad with id 0; a non-positive pad count yields an empty list.
            token_ids += [0] * (MAX_LEN - len(token_ids))
            dataset.append({'sen1': sen1, 'sen2': sen2, 'token_ids': token_ids, 'labels': labels})
        return dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(token_ids, seg_ids)`` in test mode, else ``(token_ids, seg_ids, labels)``."""
        example = self.dataset[idx]
        token_ids = torch.tensor(example['token_ids'])
        seg_ids = self.get_seg_ids(token_ids)
        if self.mode == 'test':
            return token_ids, seg_ids
        return token_ids, seg_ids, torch.tensor(example['labels'])

    def get_seg_ids(self, ids):
        """Derive segment ids from a 1-D tensor of token ids.

        A position's segment is the number of ``[SEP]`` tokens strictly before
        it, so each ``[SEP]`` belongs to the segment it closes. Positions after
        the last ``[SEP]`` (the padding) are reset to segment 0.
        """
        is_sep = ids == self.sep_token_id
        # Exclusive running count of separators: inclusive cumsum minus the
        # separator at the current position itself.
        seps_before = is_sep.long().cumsum(0) - is_sep.long()
        trailing = seps_before == is_sep.sum()
        seg_ids = seps_before.to(ids.dtype)
        seg_ids[trailing] = 0
        return seg_ids

def get_dataloader(dataset, mode):
    """Wrap ``dataset`` in a ``DataSet`` and a matching ``DataLoader``.

    Args:
        dataset: list of example dicts accepted by ``DataSet``.
        mode: ``'train'`` (shuffled, last partial batch dropped) or
            ``'valid'`` / ``'test'`` (original order, every example kept).

    Returns:
        ``(dataloader, torchdata)``: the loader and the underlying ``DataSet``.

    Raises:
        ValueError: if ``mode`` is not one of the three supported values.
    """
    # Validate first: tokenizing the whole dataset is expensive, and an unknown
    # mode used to surface only afterwards as an UnboundLocalError.
    if mode not in ('train', 'valid', 'test'):
        raise ValueError("mode must be 'train', 'valid' or 'test', got {!r}".format(mode))
    torchdata = DataSet(dataset, mode=mode)
    is_train = mode == 'train'
    dataloader = torch.utils.data.DataLoader(
        torchdata,
        batch_size=BATCH_SIZE,
        shuffle=is_train,
        num_workers=0,
        drop_last=is_train,
    )
    return dataloader, torchdata

# train_dataloader, train_torchdata = get_dataloader(train_dataset, mode='train')
# test_dataloader, test_torchdata = get_dataloader(test_dataset, mode='test')

In [5]:
def debug_label():
    """Print the token ids, segment ids and labels of the first training batch."""
    loader, _ = get_dataloader(train_dataset, mode='train')
    first_batch = next(iter(loader), None)
    # An empty loader yields nothing to show.
    if first_batch is None:
        return
    for tensor in first_batch[:3]:
        print(tensor)
# debug_label()

In [6]:
import logging
def get_logger(filename, verbosity=1, name=None, console=False):
    """Return a logger that writes formatted records to ``filename``.

    The file is opened in ``"w"`` mode, so it is truncated on every call.
    Calling the function again for the same logger and file replaces the
    previous file handler instead of stacking a second one, which keeps
    records from being written twice when the cell is re-run.

    Args:
        filename: path of the log file.
        verbosity: 0 = DEBUG, 1 = INFO, 2 = WARNING.
        name: logger name; ``None`` selects the root logger.
        console: also echo records to the default stream when true.

    Returns:
        The configured ``logging.Logger``.

    Raises:
        KeyError: if ``verbosity`` is not 0, 1 or 2.
    """
    level_dict = {0: logging.DEBUG, 1: logging.INFO, 2: logging.WARNING}
    formatter = logging.Formatter(
        "[%(asctime)s][%(filename)s][line:%(lineno)d][%(levelname)s] %(message)s"
    )
    logger = logging.getLogger(name)
    logger.setLevel(level_dict[verbosity])

    # Detach and close handlers left over from an earlier call for this file.
    # Handlers pointing elsewhere are left alone.
    log_path = os.path.abspath(filename)
    for handler in list(logger.handlers):
        if isinstance(handler, logging.FileHandler) and handler.baseFilename == log_path:
            logger.removeHandler(handler)
            handler.close()

    fh = logging.FileHandler(filename, "w")
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # Console output is opt-in; at most one plain stream handler is attached.
    if console and not any(type(h) is logging.StreamHandler for h in logger.handlers):
        sh = logging.StreamHandler()
        sh.setFormatter(formatter)
        logger.addHandler(sh)
    return logger

In [7]:
class BERT_Model(nn.Module):
    """BERT encoder with a single-logit head for binary sentence-pair classification.

    ``forward`` returns the scalar training loss when labels are given and
    per-example probabilities of shape ``(batch, 1)`` otherwise.
    """

    def __init__(self):
        super(BERT_Model, self).__init__()
        self.bert = BertModel.from_pretrained(BERT_MODEL_PATH, output_hidden_states=True, return_dict=True)
        # Read the width from the loaded checkpoint instead of assuming 768,
        # so checkpoints of other sizes work as well.
        self.hidden_size = self.bert.config.hidden_size
        self.linear = nn.Linear(self.hidden_size, 1)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, seg_ids, labels_ids=None):
        """Run the encoder and the classification head.

        Args:
            input_ids: token id tensor; id 0 is treated as padding.
            seg_ids: token-type (segment) ids matching ``input_ids``.
            labels_ids: optional 0/1 labels, one per example.

        Returns:
            The mean binary cross-entropy loss if ``labels_ids`` is given,
            otherwise sigmoid probabilities of shape ``(batch, 1)``.
        """
        # Every non-zero id is a real token; zeros are padding.
        attention_mask = (input_ids > 0)
        out = self.bert(input_ids=input_ids, token_type_ids=seg_ids, attention_mask=attention_mask)
        # NOTE(review): dropout is applied to the logit itself (after the linear
        # layer). The order is kept as it was; applying it to the pooled
        # vector instead is the more common arrangement — confirm intent.
        logits = self.dropout(self.linear(out.pooler_output))
        if labels_ids is not None:
            # Same objective as sigmoid followed by BCELoss, computed from the
            # logits so it stays numerically stable for saturated outputs.
            return F.binary_cross_entropy_with_logits(
                logits.view(-1, 1).float(), labels_ids.view(-1, 1).float()
            )
        return torch.sigmoid(logits)

class FGM():
    """Fast Gradient Method: adversarial perturbation of embedding weights.

    Usage per step: backward on the clean loss, ``attack()``, backward on the
    adversarial loss, ``restore()``, then the optimizer step.
    """

    def __init__(self, model):
        self.model = model
        # name -> copy of the parameter data taken just before it was perturbed
        self.backup = {}

    def attack(self, epsilon=0.5, emb_name='bert.embeddings.word_embeddings.weight'):
        """Shift matching parameters by ``epsilon * grad / ||grad||``.

        Every trainable parameter whose name contains ``emb_name`` is backed
        up. It is perturbed only when it has a finite, non-zero gradient.
        """
        for name, param in self.model.named_parameters():
            if not (param.requires_grad and emb_name in name):
                continue
            # Always back up, so restore() finds an entry for every match.
            self.backup[name] = param.data.clone()
            if param.grad is None:
                # No backward pass has reached this parameter yet; there is no
                # direction to move in (torch.norm(None) would raise).
                continue
            norm = torch.norm(param.grad)
            if norm != 0 and not torch.isnan(norm):
                r_at = epsilon * param.grad / norm
                param.data.add_(r_at)

    def restore(self, emb_name='bert.embeddings.word_embeddings.weight'):
        """Put back the values saved by the preceding ``attack`` call and clear the backup."""
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}

In [8]:
def validation_funtion(model, valid_dataloader, valid_torchdata, mode):
    """Run ``model`` over ``valid_dataloader`` and score or return its outputs.

    Args:
        model: module called as ``model(token_ids, seg_ids)`` to get probabilities.
        valid_dataloader: yields ``(token_ids, seg_ids)`` batches in ``'test'``
            mode and ``(token_ids, seg_ids, labels)`` batches otherwise.
        valid_torchdata: unused; kept for interface compatibility.
        mode: ``'valid'`` returns metrics, anything else returns raw outputs.

    Returns:
        ``(auc, precision, recall, f1)`` when ``mode == 'valid'`` (predictions
        thresholded at 0.5; the second value is precision even though callers
        print it as "accuracy"), otherwise the list of per-example output
        tensors in loader order.

    Note:
        The model is left in eval mode.
    """
    model.eval()
    results = []
    true_label = []
    has_labels = mode != 'test'
    # Inference only: without no_grad every forward pass would keep its
    # autograd graph alive, wasting memory and time.
    with torch.no_grad():
        for batch in tqdm(valid_dataloader):
            output = model(batch[0].to(DEVICE), batch[1].to(DEVICE))
            results += list(output.detach().cpu())
            if has_labels:
                true_label += list(batch[2])
    if mode == 'valid':
        auc = roc_auc_score(true_label, results)
        # Threshold once and reuse for the three label-based metrics.
        predicted = [1 if i >= 0.5 else 0 for i in results]
        precision = precision_score(true_label, predicted)
        recall = recall_score(true_label, predicted)
        f1 = f1_score(true_label, predicted)
        return auc, precision, recall, f1
    else:
        return results
                            
def train(model, train_dataloader, valid_dataloader, valid_torchdata, epochs, early_stop=None):
    """Fine-tune ``model`` with FGM adversarial training and save checkpoints.

    Each step does a clean backward pass, perturbs the word embeddings with
    FGM, accumulates the adversarial gradients, restores the embeddings and
    then updates the weights. After every epoch the model is scored on the
    validation loader.

    Checkpoints are written to ``'{NAME}_model_{model_num}.bin'`` and the
    global ``model_num`` counter is advanced:
      * with ``early_stop``: only the best-AUC epoch is kept (one file per call);
      * without it: each of the last two epochs is saved under its own number.

    Args:
        model: module returning the loss when called with labels.
        train_dataloader: yields ``(input_ids, seg_ids, label_ids)`` batches.
        valid_dataloader: loader passed to ``validation_funtion``.
        valid_torchdata: forwarded to ``validation_funtion``.
        epochs: maximum number of epochs.
        early_stop: patience, i.e. the number of consecutive epochs without an
            AUC improvement after which training stops; ``None`` disables it.

    Returns:
        The best validation AUC seen during this call.
    """
    global logger, model_num
    param_optimizer = list(model.named_parameters())
    # Biases and LayerNorm parameters are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=2e-5, amsgrad=True)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=3, T_mult=2)
    fgm = FGM(model)
    steps_per_epoch = len(train_dataloader)
    # Running statistics over every step so far; this replaces rebuilding an
    # array from the full loss history on each step.
    loss_sum = 0.0
    loss_count = 0
    last_loss = float('nan')
    best_score = -np.inf
    no_improve = 0
    for epoch in range(epochs):
        model.train()
        bar = tqdm(train_dataloader)
        for i, (input_ids, seg_ids, label_ids) in enumerate(bar):
            # Move the batch once; it is used by both forward passes.
            input_ids = input_ids.to(DEVICE)
            seg_ids = seg_ids.to(DEVICE)
            label_ids = label_ids.to(DEVICE)

            optimizer.zero_grad()
            loss = model(input_ids, seg_ids, label_ids)
            loss.backward()
            last_loss = loss.item()
            loss_sum += last_loss
            loss_count += 1

            # Adversarial pass: its gradients add to the clean ones.
            fgm.attack()
            loss_adv = model(input_ids, seg_ids, label_ids)
            loss_adv.backward()
            fgm.restore()

            # Update the weights first, then advance the schedule.
            optimizer.step()
            scheduler.step(epoch + i / steps_per_epoch)
            bar.set_postfix(tloss=loss_sum / loss_count)
        # The second value returned by validation_funtion is precision; the
        # messages below keep their existing "accuracy" label.
        auc, accuracy, recall, f1 = validation_funtion(model, valid_dataloader, valid_torchdata, 'valid')
        print('train_loss: {:.5f}, auc: {:.5f}, accuracy: {:.5f}, recall: {:.5f}, f1_socre: {:.5f}\n'.format(last_loss,auc,accuracy,recall,f1))
        logger.info('Epoch:[{}]\t auc={:.5f}\t accuracy={:.5f}\t recall={:.5f}\t f1_socre={:.5f}'.format(epoch,auc,accuracy,recall,f1))

        # Track the best AUC in every configuration so the return value is
        # meaningful even when early stopping is disabled.
        improved = auc > best_score
        if improved:
            best_score = auc
        if early_stop:
            if improved:
                # Patience counts consecutive stale epochs, so reset it.
                no_improve = 0
                torch.save(model.state_dict(), '{}_model_{}.bin'.format(NAME, model_num))
            else:
                no_improve += 1
            if no_improve >= early_stop:
                model_num += 1
                break
            if epoch == epochs-1:
                model_num += 1
        else:
            if epoch >= epochs-2:
                torch.save(model.state_dict(), '{}_model_{}.bin'.format(NAME, model_num))
                model_num += 1
    return best_score

In [9]:
# Stratified K-fold training driver: trains one fresh model per fold and collects
# the score returned by `train` for each of them.
FOLD = 10
kf = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=SEED)
# Global counter read and advanced inside `train` to number the checkpoint files.
model_num = 1
# NOTE(review): not read anywhere in this cell; the inference cell rebinds this name to a list.
test_preds_total = collections.defaultdict(list)
# Global logger used inside `train`; writes to '<NAME>.log'.
logger = get_logger('{}.log'.format(NAME))
best_score = []
# The split is stratified on the label column; only row positions are passed as features.
for i, (train_index, test_index) in enumerate(kf.split(np.arange(train_data.shape[0]), train_data.labels.values)):
    print(str(i+1), '-'*50)
    # Select this fold's training / validation examples by position.
    tra = [train_dataset[index] for index in train_index]
    val = [train_dataset[index] for index in test_index]
    train_dataloader, train_torchdata = get_dataloader(tra, mode='train')
    valid_dataloader, valid_torchdata = get_dataloader(val, mode='valid')
    # A new model is loaded from the pretrained weights for every fold.
    model = BERT_Model()
    model.to(DEVICE)
    score = train(model,train_dataloader,
                    valid_dataloader,
                    valid_torchdata,
                    epochs=5)
    # NOTE(review): `model` is still referenced at this point, so presumably only
    # cached blocks that are no longer in use can be released here — confirm.
    torch.cuda.empty_cache()
    best_score.append(score)
# Per-fold summary of the values returned by `train`.
for i in range(FOLD):
    print('第{}折中，best auc: {}'.format(i+1, best_score[i]))
    print('-'*50)

In [10]:
# model_num = 21
# Ensemble inference: load every saved checkpoint (numbered 1 .. model_num-1) into one
# model instance, predict the test set with each, and average the predictions.
model = BERT_Model()
model.to(DEVICE)
test_preds_total = []
test_dataloader, test_torchdata = get_dataloader(test_dataset, mode='test')
for i in range(1,model_num):
    # map_location lets checkpoints saved on a GPU be loaded on whatever device is in use now.
    model.load_state_dict(torch.load('{}_model_{}.bin'.format(NAME, i), map_location=DEVICE))
    test_pred_results = validation_funtion(model, test_dataloader, test_torchdata, 'test')
    test_preds_total.append(test_pred_results)
# Fail loudly instead of silently producing NaN from a 0/0 average.
if not test_preds_total:
    raise RuntimeError('no checkpoints to ensemble: model_num is {}'.format(model_num))
test_preds_merge = np.sum(test_preds_total, axis=0) / (model_num-1)

Some weights of BertModel were not initialized from the model checkpoint at zjcmodel and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=782.0), HTML(value='')))




In [11]:
import os
# Write one averaged prediction per line. The context manager closes the file even if
# formatting or writing fails part-way through.
# NOTE(review): each entry is formatted with str(); if the merged predictions carry a
# trailing dimension this writes lines such as "[0.5]" — confirm the expected format.
with open('2_submit.txt','w') as f:
    f.writelines(str(x)+'\n' for x in test_preds_merge)