In [19]:
import argparse
import glob
import json
import logging
import os
import random

In [20]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader,RandomSampler,SequentialSampler,TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm,trange

In [21]:
from transformers import (WEIGHTS_NAME,
                         BertConfig,
                         BertModel,
                         BertTokenizer,)

from transformers import glue_compute_metrics as compute_metrics
from transformers import glue_convert_examples_to_features as convert_examples_to_features
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors

from transformers.data.processors.utils import InputExample,DataProcessor

In [22]:
try:
    from torch.utils.tensorboard import SummaryWriter   #version 1.14 or higher
except ImportError:
    from tensorboardX import SummaryWriter

In [23]:
import code
import pickle
from  sklearn.metrics.pairwise import cosine_similarity

In [25]:
logger=logging.getLogger(__name__)
MODEL_CLASSES={
    "bert":(BertConfig,BertModel,BertTokenizer),
}

In [32]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    #if args.n_gpu>0:
    #    torch.cuda.manual_seed_all(seed)

In [46]:
class FAQProcessor(DataProcessor):
    # def get_example_from_tensor_dict(self,tensor_dict):
    #    return InputExample(
    #        tensor_dict["idx"].numpy(),
    #        tensor_dict["sentence"].numpy().decode("utf-8"),
    #        None,
    #        str(tensor_dict["label"].numpy()),
    #    )
    
    def get_candidates(self,file_dir):
        train_df=pd.read_csv(file_dir)
        candidates=train_df[train_df["is_best"]==1][["title","reply"]]
        self.candidate_title=candidates["title"].tolist()
        self.candidate_reply=candidates["reply"].tolist()
        return self._create_examples(self.candidate_title,"train")
    
    def _create_examples(self,lines,set_type):
        examples=[]
        for (i,line) in enumerate(lines):
            guid="%s-%s" % (set_type,i)
            examples.append(InputExample(guid=guid,text_a=line,text_b=None,label=1))
        return examples

In [52]:
def evaluate(args,model,eval_dataset):
    pooled_outputs=[]
    
    
    eval_sampler=SequentialSampler(eval_dataset)
    eval_dataloader=DataLoader(eval_dataset,sampler=eval_sampler,batch_size=16)
    
    
    logger.info(" Num examples = %d",len(eval_dataset))
    logger.info(" Batch size = %d",16)
    eval_loss=0.0
    nb_eval_steps=0
    preds=None
    ouut_label_ids=None
    
    for batch in tqdm(eval_dataloader,desc="Evaluating"):
        model.eval()
        batch=tuple(batch)
        
        with torch.no_grad():
            inputs={"input_ids": batch[0],"attention_mask":batch[1]}
            if args["model_type"]!="distilbert":
                inputs["token_type_ids"]=(
                    batch[2] if args["model_type"] in ["bert","xlnet"] else None
                )
            
            outputs=model(**inputs)
            # 关于这两个outputs
            #
            # 1.sequence_output 代表每个单词的向量,和ELMo出来的一样
            #   size是: batch_size * seq_len * hidden_size, 32*句子里单词数量*768
            #   在计算cosine similarity时, 也是可以用和ELMo方法一样,在seq_len上求平均
            #   理论上这么做比用pooled_output好点,因为[CLS] token不太很能表示句子信息
            #
            # 2.pooled_output代表句子的向量
            #   它其实是[CLS] tokens经过一个Linear层和一个tanh层的结果 
            #   size是: batch_size * hidden_size,  32*768
            sequence_output,pooled_output=outputs[:2]
            pooled_outputs.append(pooled_output)
            
    return pooled_outputs  

In [50]:
def load_examples(args,task, tokenizer):
    processor = FAQProcessor()
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args["data_dir"],
        "cached_{}_{}_{}".format(
            list(filter(None, args["model_name_or_path"].split("/"))).pop(),
            str(args["max_seq_length"]),
            str(task),
        ),
    )
    logger.info("Creating features from dataset file at %s", args["data_dir"])
    examples = (
        processor.get_candidates(args["data_dir"]) 
    )
    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=[1],
        output_mode="classification",
        max_length=args["max_seq_length"],
        pad_on_left=bool(args["model_type"] in ["xlnet"]),  # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
    )


    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids)
    return dataset, processor.candidate_title, processor.candidate_reply

In [54]:
def main(args):
    
    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )


    set_seed()
    task_name = ""
    model_type = args["model_type"]
    
    
    config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
    config = config_class.from_pretrained(
        args["config_name"],
        finetuning_task=task_name, 
        cache_dir=None,
    )
    tokenizer = tokenizer_class.from_pretrained(
        args["tokenizer_name"],
        do_lower_case=True,
        cache_dir=None,
    )
    model = model_class.from_pretrained(
        args["model_name_or_path"],
        from_tf=bool(".ckpt" in args["model_name_or_path"]),
        config=config,
        cache_dir=None,
    )

 
    model.to(args["device"])

    logger.info("Training/evaluation parameters %s", args)

    eval_dataset, candidate_title, candidate_reply = load_examples(args, task_name, tokenizer)
    
    pooled_outputs = evaluate(args, model, eval_dataset)
    
    #把所有candidates embedding拼起来, 每个pooled_output是32, 拼起来一共 18677*768
    candidate_embeddings = torch.cat([o.cpu().data for o in pooled_outputs]).numpy()

    # code.interact(local=locals())

    # load dataset
    if not os.path.exists("embeddings.pkl"):
        with open("embeddings.pkl", "wb") as fout:
            pickle.dump([candidate_title, candidate_reply, candidate_embeddings], fout)
    else:
        with open("embeddings.pkl", "rb") as fin:
            candidate_title, candidate_reply, candidate_embeddings = pickle.load(fin)

    while True:
        title = input("你的问题是？\n")
        if len(title.strip()) == 0:
            continue

        examples = [InputExample(guid=0, text_a=title, text_b=None, label=1)]
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=[1],
            output_mode="classification",
            max_length=args["max_seq_length"],
            pad_on_left=bool(args["model_type"] in ["xlnet"]),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
        )

        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)

        dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids)
        pooled_outputs = evaluate(args, model, dataset)
        title_embedding = torch.cat([o.cpu().data for o in pooled_outputs]).numpy()

        scores = cosine_similarity(title_embedding, candidate_embeddings)[0]
        top5_indices = scores.argsort()[-5:][::-1]
        for index in top5_indices:
            print("可能的答案，参考问题：" + candidate_title[index] + "\t答案：" + candidate_reply[index] + "\t得分：" + str(scores[index]))
            print()


In [55]:
args={
    "model_type":"bert",
    "data_dir": "/Users/valleria_ruka/Desktop/FAQ/lawzhidao_filter.csv",
    "model_name_or_path": "/Users/valleria_ruka/Desktop/FAQ/BERT/chinese_wwm_ext_pytorch/",
    "config_name": "/Users/valleria_ruka/Desktop/FAQ/BERT/chinese_wwm_ext_pytorch/",
    "tokenizer_name": "/Users/valleria_ruka/Desktop/FAQ/BERT/chinese_wwm_ext_pytorch/",
    "do_train":False,
    "do_eval":False,
    "evaluate_during_training":False,
    "do_lower_case":False,
    "per_gpu_train_batch_size":8,
    "per_gpu_eval_batch_size":8,
    "gradient_accumulation_steps":1,
    "learning_rate":5e-5,
    "adam_epsilon":1e-8,
    "max_grad_norm":1.0,
    "weight_decay":0.0,
    "max_grad_norm":1.0,
    "max_seq_length":128,
    "device":"cpu",
    
    
}


if torch.cuda.is_available():
    args_eval["device"]="cuda"

main(args)

01/02/2020 21:38:11 - INFO - transformers.configuration_utils -   loading configuration file /Users/valleria_ruka/Desktop/FAQ/BERT/chinese_wwm_ext_pytorch/config.json
01/02/2020 21:38:11 - INFO - transformers.configuration_utils -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": "",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  

01/02/2020 21:38:14 - INFO - transformers.data.processors.glue -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
01/02/2020 21:38:14 - INFO - transformers.data.processors.glue -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
01/02/2020 21:38:14 - INFO - transformers.data.processors.glue -   label: 1 (id = 0)
01/02/2020 21:38:14 - INFO - transformers.data.processors.glue -   *** Example ***
01/02/2020 21:38:14 - INFO - transformers.data.processors.glue -   guid: train-4
01/02/2020 21:38:14 - INFO - transformers.data.processors.glue -   input_

你的问题是？
汽车超速怎么办?


01/02/2020 22:43:16 - INFO - transformers.data.processors.glue -   Writing example 0
01/02/2020 22:43:16 - INFO - transformers.data.processors.glue -   *** Example ***
01/02/2020 22:43:16 - INFO - transformers.data.processors.glue -   guid: 0
01/02/2020 22:43:16 - INFO - transformers.data.processors.glue -   input_ids: 101 3749 6756 6631 6862 2582 720 1215 136 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
01/02/2020 22:43:16 - INFO - transformers.data.processors.glue -   attention_mask: 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
01/02/2020 22:43:16 - INFO - transformers.data.processors.

可能的答案，参考问题：遇到车祸，肇事车辆逃逸后，怎么处理	答案：所谓交通事故逃逸是指行为人在发生交通事故后，为逃避法律追究而逃跑的行为。主观上是为了逃避法律责任的追究。交通事故逃逸的，具体处理如下：1、交通肇事后逃逸或者有其他特别恶劣情节的，处3年以上7年以下有期徒刑。所谓“交通肇事后逃逸”，《解释》第3条规定，是指行为人具有本解释第2条第1款规定和第2款第(1)至(5)项规定的情形之一，在发生交通事故后，为逃避法律追究而逃跑的行为。这里要注意对“交通肇事后逃逸”的认定，首先，交通肇事逃逸的前提条件是“为逃避法律追究”，其次，交通肇事逃逸并没有时间和场所的限定，不应仅理解为“逃离事故现场”，对于肇事后未逃离(或未能逃离)事故现场，而是在将伤者送至医院后或者等待交通管理部门处理的时候逃跑的，也应视为“交通肇事后逃逸”。所谓“其他特别恶劣情节”，《解释》第4条规定：交通肇事具有下列情形之一的，属于“有其他特别恶劣情节”：(1)死亡二人以上或者重伤五人以上，负事故全部或者主要责任的;(2)死亡六人以上，负事故同等责任的;(3)造成公共财产或者他人财产直接损失，负事故全部或者主要责任，无能力赔偿数额在六十万元以上的。2、因交通肇事逃逸致人死亡的，处7年以上有期徒刑。根据《解释》，“因交通肇事逃逸致人死亡”，是指行为人在交通肇事后为逃避法律追究而逃跑，致使被害人因得不到救助而死亡的情形。但刑法理论上对“因交通肇事逃逸致人死亡”形成了诸多不同的观点。本书认为，“因交通肇事逃逸致人死亡”，的心理态度应限于过失，因为交通肇事罪是一种过失犯罪，为保持犯罪构成的纯洁性，其加重构成的心理态度也应是过失。故《解释》规定：行为人在交通肇事后为逃避法律追究，将被害人带离事故现场后隐藏或者遗弃，致使被害人无法得到救助而死亡或者严重残疾的，应当分别依照刑法第232条、第234条第2款的规定，以故意杀人罪或者故意伤害罪定罪处罚。	得分：0.99053067

可能的答案，参考问题：交通事故被人撞了怎么办？	答案：这种情况是交通事故引发的经济纠纷.首先需要拿到交警的事故责任认定书，然后确定双方的责任比例，比如说主要责任次要责任（比如说责任比例是三七开，即主要责任的是7，次要责任是3），按照这个比例，你方的费用对方需要承担70%，你需要收集你的医药费收据票据等，要求对方赔付，如果对方不赔付，你可以收集这

01/02/2020 22:44:00 - INFO - transformers.data.processors.glue -   Writing example 0
01/02/2020 22:44:00 - INFO - transformers.data.processors.glue -   *** Example ***
01/02/2020 22:44:00 - INFO - transformers.data.processors.glue -   guid: 0
01/02/2020 22:44:00 - INFO - transformers.data.processors.glue -   input_ids: 101 2582 720 3416 2823 2526 2360 136 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
01/02/2020 22:44:00 - INFO - transformers.data.processors.glue -   attention_mask: 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
01/02/2020 22:44:00 - INFO - transformers.data.processors.glu

可能的答案，参考问题：工资保密协议如何写?	答案：举例如下：工资保密协议甲方：乙方：为了保护甲乙双方的权益，保守双方的薪酬秘密，维护和谐的工作氛围，促进双方的稳定发展，甲、乙双方在遵循诚实信用、平等自愿的原则下，经协商就薪酬保密事项达成协议，条款如下：一、甲方的工资薪酬体系设置属于甲方的管理信息内容之一，并经甲方采取保密措施，属于甲方的商业秘密。乙方所接触的工资薪酬信息未经甲方允许不得向第三方透露。乙方必须严格遵守甲方的保密制度，不得询问与自己薪酬无关的情况，不得泄漏本人的薪酬待遇情况，不得以任何形式向他人询问薪酬状况。非本人原因知悉他人薪酬情况者，应及时向甲方主管报告，并采取措施防止泄密进一步扩大，不得传播、散步、比对。二、若乙方作为管理人员及相关工作人员参与或接触薪酬管理或工资作业，要严格遵守公司薪酬保密制度，不得向第三方泄漏所知悉的工资薪酬情况，不得在任何非作业场所公开谈论与薪酬有关的事宜，不得在与薪酬作业无关的人员面前进行操作。带有工资信息的资料必须妥善保管，需存放入文件柜或抽屉，不得随意乱放。废弃不用的资料信息，需用粉碎机进行粉碎处理。三、甲方要对乙方的工资薪酬情况进行保密，非因履行工作职责之需要不得向第三方透露。四、违约责任约定：本协议约定的工资保密事项是公司的重要纪律。违反本协议约定的，一经核实给予违纪解除劳动合同的处理。五、本协议正本一式两份，甲乙双方各执一份，作为劳动合同附件，与劳动合同具有同等法律效力。本协议经甲、乙双方签字或盖章之日起生效。甲方（盖章或签字）：乙方（签字）：日期：年月日日期：年月日	得分：0.990789

可能的答案，参考问题：要怎样才能申请法律援助	答案：根据《法律援助条例》相关规定，申请法律援助，要符合援助范围及所需材料。《法律援助条例》第十条公*对下列需要代理的事项，因经济困难没有委托代理人的，可以向法律援助机构申请法律援助：（一）依法请求国家赔偿的；（二）请求给予社会保险待遇或者最低生活保障待遇的；（三）请求发给抚恤金、救济金的；（四）请求给付赡养费、抚养费、扶养费的；（五）请求支付劳动报酬的；（六）主张因见义勇为行为产生的*事权益的。省、自治区、直辖市人*政府可以对前款规定以外的法律援助事项作出补充规定。公*可以就本条第一款、第二款规定的事项向法律援助机构申请法律咨询。第十一条刑事诉讼中有下列情形之一的，公*可以向法

KeyboardInterrupt: 