In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author  : nsytsqdtn
# @Blog    ：https://www.nsytsqdtn.cn
import pandas as pd
import numpy as np
import codecs
import collections
import re
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from transformers import BertForPreTraining, BertModel, BertTokenizer
import warnings
import torch.nn as nn
from bert4keras.tokenizers import Tokenizer
from tqdm import tqdm_notebook
from tqdm import tqdm
import random
import os
from torch.utils import data
from torch import nn
from torchcrf import CRF
import torch.nn.functional as F
from torch.optim import *
torch.set_printoptions(edgeitems=768)
tqdm.pandas()
warnings.filterwarnings("ignore")
np.set_printoptions(threshold=np.inf)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BERT_MODEL_PATH = '../../../pre_model/chinese_roberta_wwm_ext_pytorch/'
dict_path = '../../../pre_model/chinese_roberta_wwm_ext_pytorch/vocab.txt'
# 设置基本参数
MAX_LEN = 20
BATCH_SIZE = 8
SEP_TOKEN_ID = 102
SEED=2019
NAME = 'robert'
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if DEVICE=='cuda':
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
DEVICE

Using TensorFlow backend.
  from pandas import Panel


device(type='cuda')

In [2]:
train_query = pd.read_csv('../../data/house/train/train.query.tsv', sep='\t', header=None)
train_query.rename(columns={0:'query_id', 1:'question'}, inplace=True)



train_reply = pd.read_csv('../../data/house/train/train.reply.tsv', sep='\t', header=None)
train_reply.rename(columns={0:'query_id', 1:'reply_id', 2:'answer', 3:'label'}, inplace=True)
train_data = train_query.merge(train_reply, on='query_id', how='left')

train_reply['answer'] = train_reply['answer'].astype('str')

test_query = pd.read_csv('../../data/house/test/test.query.tsv', sep='\t', header=None, encoding='GB18030')
test_query.rename(columns={0:'query_id', 1:'question'}, inplace=True)

test_reply = pd.read_csv('../../data/house/test/test.reply.tsv', sep='\t', header=None, encoding='GB18030')
test_reply.rename(columns={0:'query_id', 1:'reply_id', 2:'answer'}, inplace=True)
test_reply['answer'] = test_reply['answer'].astype('str')
print(train_query.head())
print(train_reply.head())

   query_id            question
0         0            采荷一小是分校吧
1         1                毛坯吗？
2         2  你们的佣金费大约是多少和契税是多少。
3         3             靠近川沙路嘛？
4         4      这套房源价格还有优惠空间吗？
   query_id  reply_id                       answer  label
0         0         0  杭州市采荷第一小学钱江苑校区，杭州市钱江新城实验学校。      1
1         0         1                           是的      0
2         0         2                         这是5楼      0
3         1         0                   因为公积金贷款贷的少      0
4         1         1                           是呢      0


In [3]:
train_dataset = []
for i in tqdm_notebook(range(len(train_query))):
    train_dict = {}
    train_dict['query_id'] = train_query.loc[i, 'query_id']
    train_dict['question'] = train_query.loc[i, 'question']
    train_qa = train_reply[train_reply['query_id'] == i].reset_index()
    reply_li = []
    for j in range(len(train_qa)):
        reply_dict = {}
        reply_id = train_qa.loc[j, 'reply_id']
        answer = train_qa.loc[j, 'answer']
        label = train_qa.loc[j, 'label']
        reply_dict['reply_id'] = reply_id
        reply_dict['reply'] = answer
        reply_dict['label'] = label
        reply_li.append(reply_dict)
    train_dict['all_reply'] = reply_li
    train_dataset.append(train_dict)
len(train_dataset)

HBox(children=(FloatProgress(value=0.0, max=6000.0), HTML(value='')))




6000

In [4]:
test_dataset = []
for i in tqdm_notebook(range(len(test_query))):
    test_dict = {}
    test_dict['query_id'] = test_query.loc[i, 'query_id']
    test_dict['question'] = test_query.loc[i, 'question']
    test_qa = test_reply[test_reply['query_id'] == i].reset_index()
    reply_li = []
    for j in range(len(test_qa)):
        reply_dict = {}
        reply_id = test_qa.loc[j, 'reply_id']
        answer = test_qa.loc[j, 'answer']
        reply_dict['reply_id'] = reply_id
        reply_dict['reply'] = answer
        reply_li.append(reply_dict)
    test_dict['all_reply'] = reply_li
    test_dataset.append(test_dict)
test_dataset[0]

HBox(children=(FloatProgress(value=0.0, max=14000.0), HTML(value='')))




{'query_id': 0,
 'question': '东区西区？什么时候下证？',
 'all_reply': [{'reply_id': 0, 'reply': '我在给你发套'},
  {'reply_id': 1, 'reply': '您看下我发的这几套'},
  {'reply_id': 2, 'reply': '这两套也是金源花园的'},
  {'reply_id': 3, 'reply': '价钱低'},
  {'reply_id': 4, 'reply': '便宜的房子，一般都是顶楼'}]}

In [5]:
class DataSet(data.Dataset):
    def __init__(self, data, mode='train'):
        self.data = data
        self.tokenizer = Tokenizer(dict_path, do_lower_case=True)
        self.mode = mode
        self.dataset = self.get_data(self.data, self.tokenizer,self.mode)
        
    def get_data(self, data, tokenizer, mode):
        dataset = []
        for data_li in tqdm_notebook(data):
            query_id = data_li['query_id']
            question = data_li['question']
            all_reply = data_li['all_reply']
            question_tokens = tokenizer.tokenize(question)
            for reply_li in all_reply:
                reply_id = reply_li['reply_id']
                reply = reply_li['reply']
                reply_tokens = tokenizer.tokenize(reply)[1:]
                qa_token = (question_tokens + reply_tokens)
                qa_token = qa_token[:MAX_LEN]
                qa_token_ids = tokenizer.tokens_to_ids(qa_token)
                if len(qa_token_ids) < MAX_LEN:
                    qa_token_ids += [0] * (MAX_LEN-len(qa_token_ids))
                labels = None
                if mode != 'test':
                    labels = reply_li['label']
                dataset_dict = {'query_id':query_id, 'question':question, 'reply_id':reply_id, 'reply':reply,
                            'token_ids':qa_token_ids, 'labels': labels}
                dataset.append(dataset_dict)
        return dataset
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        data = self.dataset[idx]
        token_ids = torch.tensor(data['token_ids'])
        seg_ids = self.get_seg_ids(token_ids)
        if self.mode == 'test':
            return token_ids, seg_ids
        else:
            labels = torch.tensor(data['labels'])
            return token_ids, seg_ids, labels
    
    def get_seg_ids(self, ids):
        seg_ids = torch.zeros_like(ids)
        seg_idx = 0
        for i, e in enumerate(ids):
            seg_ids[i] = seg_idx
            if e == SEP_TOKEN_ID:
                seg_idx += 1
        max_idx = torch.nonzero(seg_ids == seg_idx)
        seg_ids[max_idx] = 0
        return seg_ids

def get_dataloader(dataset, mode):
    torchdata = DataSet(dataset, mode=mode)
    if mode == 'train':
        dataloader = torch.utils.data.DataLoader(torchdata, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, drop_last=True)
    elif mode == 'test':
        dataloader = torch.utils.data.DataLoader(torchdata, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, drop_last=False)
    elif mode == 'valid':
        dataloader = torch.utils.data.DataLoader(torchdata, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, drop_last=False)
    return dataloader, torchdata

# train_dataloader, train_torchdata = get_dataloader(train_dataset, mode='train')
# test_dataloader, test_torchdata = get_dataloader(test_dataset, mode='test')
# train_torchdata.get_seg_ids

In [6]:
import logging

def get_logger(filename, verbosity=1, name=None):
    level_dict = {0: logging.DEBUG, 1: logging.INFO, 2: logging.WARNING}
    formatter = logging.Formatter(
        "[%(asctime)s][%(filename)s][line:%(lineno)d][%(levelname)s] %(message)s"
    )
    logger = logging.getLogger(name)
    logger.setLevel(level_dict[verbosity])

    fh = logging.FileHandler(filename, "w")
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    sh = logging.StreamHandler()
    sh.setFormatter(formatter)
    logger.addHandler(sh)

    return logger

In [10]:
class BERT_Model(nn.Module):
    def __init__(self):
        super(BERT_Model, self).__init__()
        self.hidden_size = 768
        self.bert = BertModel.from_pretrained(BERT_MODEL_PATH, output_hidden_states=True)
        self.linear = nn.Linear(self.hidden_size, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, input_ids, seg_ids, labels_ids=None, mode='test'):
        attention_mask = (input_ids > 0)
        last_states, pooled_output, hidden_states = self.bert(input_ids=input_ids, token_type_ids=seg_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(pooled_output)
        seq_relationship_score = self.linear(pooled_output)

        if labels_ids is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(seq_relationship_score.view(-1,1).float(), labels_ids.view(-1,1).float())
            return loss
        else:
            return seq_relationship_score

In [12]:
def f1_metric(pred_results, true_results):
    totol_number=0
    predict_number=0
    predict_correct=0
    precision = 0
    recall = 0
    f1_score = 0
    for i in range(len(pred_results)):
        pred = pred_results[i]
        true = true_results[i]
        if pred == 1:
            predict_number += 1
        if true == 1:
            totol_number += 1
        if pred == 1 and true == 1:
            predict_correct += 1
    print('实际为1个数：{}   预测为1个数：{}   预测正确个数：{}  '.format(totol_number,predict_number,predict_correct))
    if predict_number == 0:
        precision = 0
    else:
        precision = predict_correct/predict_number
    if totol_number == 0:
        recall = 0
    else:
        recall = predict_correct/totol_number
    if precision == 0 or recall == 0:
        f1_score = 0
    else:
        f1_score = 2 * precision * recall / (precision + recall)
    return precision, recall, f1_score

def validation_funtion(model, valid_dataloader, valid_torchdata, mode):
    model.eval()
    results = []
    true_label = []
    if valid_torchdata.dataset[0]['labels'] != None:
        for i, (input_ids, seg_ids, label_ids) in enumerate(tqdm_notebook(valid_dataloader)):
            output = model(input_ids.to(DEVICE), seg_ids.to(DEVICE), mode='test')
            results += list(output.detach().cpu()) 
            true_label += list(label_ids)
    else:
        for i, (input_ids, seg_ids) in enumerate(tqdm_notebook(valid_dataloader)):
            output = model(input_ids.to(DEVICE), seg_ids.to(DEVICE), None, mode='test')
            results += list(output.detach().cpu())    
    key = 0.5
    results = [1 if result > key else 0 for result in results]
    if mode == 'valid':
        precision, recall, f1_score = f1_metric(results, true_label)
        return precision, recall, f1_score
    else:
        return results
                            
def train(model, train_dataloader, valid_dataloader, valid_torchdata, epochs, early_stop=None,logger=None):
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.8},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)
    total_loss = []
    train_loss = []
    best_f1 = -np.inf
    for epoch in range(epochs):
        model.train()
        bar = tqdm_notebook(train_dataloader)
        for i, (input_ids, seg_ids, label_ids) in enumerate(bar):
            output = model(input_ids.to(DEVICE), seg_ids.to(DEVICE), label_ids.to(DEVICE), mode='train')
            loss = output
            loss.backward()
            train_loss.append(loss.item())            
            optimizer.step()
            optimizer.zero_grad()
            bar.set_postfix(tloss=np.array(train_loss).mean())
        precision, recall, f1_score = validation_funtion(model, valid_dataloader, valid_torchdata, 'valid')
        print('train_loss: {}, precision: {}, recall: {}, f1_score: {}\n'.format(train_loss[-1], precision, recall, f1_score))
        logger.info('Epoch:[{}]\t precision={:.3f}\t recall={:.3f}\t f1_score={:.3f}'.format(epoch, precision, recall, f1_score))
        global model_num
        if epoch == epochs-2:
            torch.save(model.state_dict(), 'model/{}_model_{}.bin'.format(NAME,model_num))
            model_num += 1

In [13]:
FOLD = 5
kf = KFold(n_splits=FOLD, shuffle=True, random_state=SEED)
model_num = 1
test_preds_total = collections.defaultdict(list)
logger = get_logger('logging/{}.log'.format(NAME))
for i, (train_index, test_index) in enumerate(kf.split(train_dataset[:500])):
    print(str(i+1), '-'*50)
    tra = [train_dataset[index] for index in train_index]
    val = [train_dataset[index] for index in test_index]
    train_dataloader, _ = get_dataloader(tra, mode='train')
    valid_dataloader, valid_torchdata = get_dataloader(val, mode='valid')
    model = BERT_Model()
    model.to(DEVICE)
    losses = train(model,train_dataloader,
                    valid_dataloader,
                    valid_torchdata,
                    epochs=10,
                    early_stop=2,
                    logger=logger)
    torch.cuda.empty_cache()

1 --------------------------------------------------


HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=113.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=115.0), HTML(value='')))

[2021-03-06 01:11:04,815][<ipython-input-12-504499fe7324>][line:76][INFO] Epoch:[0]	 precision=0.000	 recall=0.000	 f1_score=0.000
[2021-03-06 01:11:04,815][<ipython-input-12-504499fe7324>][line:76][INFO] Epoch:[0]	 precision=0.000	 recall=0.000	 f1_score=0.000
[2021-03-06 01:11:04,815][<ipython-input-12-504499fe7324>][line:76][INFO] Epoch:[0]	 precision=0.000	 recall=0.000	 f1_score=0.000



实际为1个数：231   预测为1个数：0   预测正确个数：0  
train_loss: 0.24565573036670685, precision: 0, recall: 0.0, f1_score: 0

2 --------------------------------------------------


HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=114.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=114.0), HTML(value='')))

[2021-03-06 01:11:33,140][<ipython-input-12-504499fe7324>][line:76][INFO] Epoch:[0]	 precision=0.000	 recall=0.000	 f1_score=0.000
[2021-03-06 01:11:33,140][<ipython-input-12-504499fe7324>][line:76][INFO] Epoch:[0]	 precision=0.000	 recall=0.000	 f1_score=0.000
[2021-03-06 01:11:33,140][<ipython-input-12-504499fe7324>][line:76][INFO] Epoch:[0]	 precision=0.000	 recall=0.000	 f1_score=0.000



实际为1个数：218   预测为1个数：0   预测正确个数：0  
train_loss: 0.23786219954490662, precision: 0, recall: 0.0, f1_score: 0



In [14]:
test_preds_total = []
test_dataloader, test_torchdata = get_dataloader(test_dataset, mode='test')
for i in range(1,model_num):
    model.load_state_dict(torch.load('model/{}_model_{}.bin'.format(NAME, i)))
    test_pred_results, _ = validation_funtion(model, test_dataloader, test_torchdata, 'test')
    test_preds_total.append(test_pred_results)


HBox(children=(FloatProgress(value=0.0, max=14000.0), HTML(value='')))




In [15]:
test_preds_merge = []
for x in range(len(test_preds_total[0])):
    Sum = test_preds_total[0][x]+test_preds_total[1][x]+test_preds_total[2][x]+test_preds_total[3][x]+test_preds_total[4][x]
    test_preds_merge.append(Sum)
for x in range(len(test_preds_merge)):
    if test_preds_merge[x] > 2:
        test_preds_merge[x] = 1
    else:
        test_preds_merge[x] = 0
def result_deal():
    result = pd.DataFrame()
    query_id = []
    reply_id = []
    for x in range(len(test_dataset)):
        for y in range(len(test_dataset[x]['reply_id'])):
            query_id.append(test_dataset[x]['query_id'])
            reply_id.append(test_dataset[x]['reply_id'][y])
    result = pd.DataFrame({'query_id':query_id, 'reply_id':reply_id, 'labels': test_preds_merge})
    result.to_csv('result/{}_predict.tsv'.format(NAME), sep='\t',encoding='gbk',header=None,index = None)
    
result_deal()
Count = test_preds_merge.count(1)
print(Count)

IndexError: list index out of range