In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |▊                               | 10kB 18.3MB/s eta 0:00:01[K     |█▌                              | 20kB 2.2MB/s eta 0:00:01[K     |██▏                             | 30kB 3.3MB/s eta 0:00:01[K     |███                             | 40kB 2.1MB/s eta 0:00:01[K     |███▋                            | 51kB 2.6MB/s eta 0:00:01[K     |████▍                           | 61kB 3.1MB/s eta 0:00:01[K     |█████▏                          | 71kB 3.6MB/s eta 0:00:01[K     |█████▉                          | 81kB 4.1MB/s eta 0:00:01[K     |██████▋                         | 92kB 4.6MB/s eta 0:00:01[K     |███████▎                        | 102kB 3.2MB/s eta 0:00:01[K     |████████                        | 112kB 3.2MB/s eta 0:00:01[K     |████████▉                       | 122kB 3.2M

In [0]:
import argparse
import glob
import json
import logging
import os
import random

In [0]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader,RandomSampler,SequentialSampler,TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm,trange

In [4]:
from transformers import (WEIGHTS_NAME,
                         BertConfig,
                         BertModel,
                         BertTokenizer,)

from transformers import glue_compute_metrics as compute_metrics
from transformers import glue_convert_examples_to_features as convert_examples_to_features
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors

from transformers.data.processors.utils import InputExample,DataProcessor

In [0]:
try:
    from torch.utils.tensorboard import SummaryWriter   #version 1.14 or higher
except ImportError:
    from tensorboardX import SummaryWriter

In [0]:
import code
import pickle
from  sklearn.metrics.pairwise import cosine_similarity

In [10]:
# Mount Google Drive at /content/drive; the data, model and embedding paths
# used further down all live under "/content/drive/My Drive".
# The mount asks for an authorization code interactively.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Module-level logger used by the functions defined in this notebook.
logger = logging.getLogger(__name__)

# Registry: model type name -> (config class, model class, tokenizer class).
MODEL_CLASSES = dict(
    bert=(BertConfig, BertModel, BertTokenizer),
)

In [0]:
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed handed to ``random.seed``, ``np.random.seed`` and
            ``torch.manual_seed`` (default 42).

    Note:
        No separate ``torch.cuda.manual_seed_all`` call is made here.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

In [0]:
class FAQProcessor(DataProcessor):
    """Processor that turns best-answer FAQ titles into ``InputExample`` objects."""

    def get_candidates(self,file_dir):
        """Read the CSV at ``file_dir`` and build examples from its best answers.

        Rows whose ``is_best`` column equals 1 are kept. Their ``title`` and
        ``reply`` columns are stored on the instance as the lists
        ``candidate_title`` and ``candidate_reply``.

        Args:
            file_dir: Path passed straight to ``pd.read_csv``.

        Returns:
            A list with one ``InputExample`` per kept title.
        """
        frame = pd.read_csv(file_dir)
        best_rows = frame.loc[frame["is_best"] == 1, ["title", "reply"]]
        self.candidate_title = best_rows["title"].tolist()
        self.candidate_reply = best_rows["reply"].tolist()
        return self._create_examples(self.candidate_title, "train")

    def _create_examples(self,lines,set_type):
        """Wrap every text in ``lines`` in a single-sentence ``InputExample``.

        Each guid has the form ``"<set_type>-<index>"``; ``text_b`` is ``None``
        and the label is always 1.
        """
        return [
            InputExample(
                guid="%s-%s" % (set_type, position),
                text_a=text,
                text_b=None,
                label=1,
            )
            for position, text in enumerate(lines)
        ]

In [0]:
def evaluate(args,model,eval_dataset):
    """Run ``model`` over ``eval_dataset`` and collect its pooled outputs.

    Args:
        args: Dict read for ``"device"`` (each batch tensor is moved there)
            and ``"model_type"`` (decides whether token type ids are passed).
        model: Called as ``model(**inputs)``; element 1 of its output is
            collected as the pooled sentence representation.
        eval_dataset: Dataset whose items are
            ``(input_ids, attention_mask, token_type_ids)`` tensors, in that
            order.

    Returns:
        A list holding one pooled-output tensor per batch, in dataset order.
        The tensors are left on ``args["device"]``.
    """
    batch_size=16
    eval_dataloader=DataLoader(
        eval_dataset,
        sampler=SequentialSampler(eval_dataset),
        batch_size=batch_size,
    )

    logger.info(" Num examples = %d",len(eval_dataset))
    logger.info(" Batch size = %d",batch_size)

    # Switch to inference mode once, instead of on every batch.
    model.eval()

    pooled_outputs=[]
    for batch in tqdm(eval_dataloader,desc="Evaluating"):
        batch = tuple(t.to(args["device"]) for t in batch)

        with torch.no_grad():
            inputs={"input_ids": batch[0],"attention_mask":batch[1]}
            if args["model_type"]!="distilbert":
                inputs["token_type_ids"]=(
                    batch[2] if args["model_type"] in ["bert","xlnet"] else None
                )

            outputs=model(**inputs)
            # About the first two model outputs (author's notes, translated):
            #
            # 1. sequence_output holds one vector per token, like ELMo output.
            #    Size: batch_size * seq_len * hidden_size.
            #    For cosine similarity it could be averaged over seq_len, as
            #    with ELMo; in theory that is somewhat better than using
            #    pooled_output, because the [CLS] token does not represent
            #    the sentence very well.
            #
            # 2. pooled_output is the sentence vector: the first token
            #    ([CLS]) of sequence_output passed through a Linear layer
            #    and a tanh layer. Size: batch_size * hidden_size.
            #
            # Only the pooled output is kept.
            pooled_outputs.append(outputs[1])

    return pooled_outputs

In [0]:
def load_examples(args,task, tokenizer):
    """Build the candidate-question dataset from the CSV named in ``args``.

    Args:
        args: Dict read for ``"data_dir"`` (handed to
            ``FAQProcessor.get_candidates``, which reads it with
            ``pd.read_csv``), ``"max_seq_length"`` and ``"model_type"``.
        task: Not used; kept so existing callers keep working.
        tokenizer: Tokenizer passed to ``convert_examples_to_features``; its
            ``pad_token`` is looked up to obtain the padding id.

    Returns:
        A tuple ``(dataset, candidate_title, candidate_reply)``: a
        ``TensorDataset`` of ``(input_ids, attention_mask, token_type_ids)``
        and the two lists stored on the processor by ``get_candidates``.
    """
    processor = FAQProcessor()

    # Features are always rebuilt from the CSV; nothing is cached on disk.
    logger.info("Creating features from dataset file at %s", args["data_dir"])
    examples = processor.get_candidates(args["data_dir"])

    is_xlnet = args["model_type"] in ["xlnet"]
    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=[1],
        output_mode="classification",
        max_length=args["max_seq_length"],
        pad_on_left=bool(is_xlnet),  # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if is_xlnet else 0,
    )

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids)
    return dataset, processor.candidate_title, processor.candidate_reply

In [0]:
def main(args):
    """Interactive FAQ lookup: embed candidate questions, then answer queries.

    The candidate embeddings are computed once and pickled; later runs load
    the pickle instead. The function then prompts for questions in a loop and
    prints the five candidates with the highest cosine similarity. The loop
    ends when the input stream is closed or the prompt is interrupted.

    Args:
        args: Dict read for ``"model_type"``, ``"config_name"``,
            ``"tokenizer_name"``, ``"model_name_or_path"``, ``"device"`` and
            ``"max_seq_length"``; it is also forwarded to ``load_examples``
            and ``evaluate``.
    """
    # Single definition of the embeddings cache location.
    embeddings_path = "/content/drive/My Drive/embeddings.pkl"

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    set_seed()
    task_name = ""
    model_type = args["model_type"]

    config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
    config = config_class.from_pretrained(
        args["config_name"],
        finetuning_task=task_name,
        cache_dir=None,
    )
    tokenizer = tokenizer_class.from_pretrained(
        args["tokenizer_name"],
        do_lower_case=True,
        cache_dir=None,
    )
    model = model_class.from_pretrained(
        args["model_name_or_path"],
        from_tf=bool(".ckpt" in args["model_name_or_path"]),
        config=config,
        cache_dir=None,
    )

    model.to(args["device"])

    # Build the candidate embeddings on the first run, reuse them afterwards.
    if not os.path.exists(embeddings_path):
        logger.info("Training/evaluation parameters %s", args)

        eval_dataset, candidate_title, candidate_reply = load_examples(args, task_name, tokenizer)

        pooled_outputs = evaluate(args, model, eval_dataset)

        # Concatenate the per-batch pooled outputs into one matrix with one
        # row per candidate (the author's note gives 18677 * 768 here).
        candidate_embeddings = torch.cat([o.cpu().data for o in pooled_outputs]).numpy()

        with open(embeddings_path, "wb") as fout:
            pickle.dump([candidate_title, candidate_reply, candidate_embeddings], fout)
    else:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only load an embeddings file that this notebook wrote itself.
        with open(embeddings_path, "rb") as fin:
            candidate_title, candidate_reply, candidate_embeddings = pickle.load(fin)

    while True:
        try:
            title = input("你的问题是？\n")
        except (EOFError, KeyboardInterrupt):
            # Leave the prompt loop cleanly instead of surfacing a traceback.
            break
        if len(title.strip()) == 0:
            continue

        # Encoded as [CLS] sentence [SEP]; only one sentence is used.
        examples = [InputExample(guid=0, text_a=title, text_b=None, label=1)]

        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=[1],
            output_mode="classification",
            max_length=args["max_seq_length"],
            pad_on_left=bool(args["model_type"] in ["xlnet"]),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
        )

        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)

        dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids)
        pooled_outputs = evaluate(args, model, dataset)
        title_embedding = torch.cat([o.cpu().data for o in pooled_outputs]).numpy()

        # Rank every candidate against the query; keep the five best,
        # highest score first.
        scores = cosine_similarity(title_embedding, candidate_embeddings)[0]
        top5_indices = scores.argsort()[-5:][::-1]
        for index in top5_indices:
            print("可能的答案，参考问题：" + candidate_title[index] + "\t答案：" + candidate_reply[index] + "\t得分：" + str(scores[index]))
            print()


In [30]:
# Settings for the interactive FAQ demo.
# "data_dir" is the path of the CSV file that gets read, despite its name.
args={
    "model_type":"bert",
    "data_dir": "/content/drive/My Drive/Data/lawzhidao_filter.csv",
    "model_name_or_path": "/content/drive/My Drive/chinese_wwm_ext_pytorch/",
    "config_name": "/content/drive/My Drive/chinese_wwm_ext_pytorch/",
    "tokenizer_name": "/content/drive/My Drive/chinese_wwm_ext_pytorch/",
    "do_train":False,
    "do_eval":False,
    "evaluate_during_training":False,
    "do_lower_case":False,
    "per_gpu_train_batch_size":8,
    "per_gpu_eval_batch_size":8,
    "gradient_accumulation_steps":1,
    "learning_rate":5e-5,
    "adam_epsilon":1e-8,
    "max_grad_norm":1.0,  # was listed twice with the same value
    "weight_decay":0.0,
    "max_seq_length":128,
    # Use the GPU when one is available, otherwise stay on the CPU.
    "device":"cuda" if torch.cuda.is_available() else "cpu",
}

main(args)

01/20/2020 07:45:13 - INFO - transformers.configuration_utils -   loading configuration file /content/drive/My Drive/chinese_wwm_ext_pytorch/config.json
01/20/2020 07:45:13 - INFO - transformers.configuration_utils -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": "",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "pruned_heads"

KeyboardInterrupt: ignored

In [0]:
def mean_reciprocal_rank(rs):
    """Return the mean reciprocal rank over a collection of relevance rows.

    Args:
        rs: Iterable of array-likes. Within each row, non-zero entries mark
            relevant items and the position is the 0-based rank.

    Returns:
        The mean over rows of ``1 / (index of first non-zero entry + 1)``;
        a row without any non-zero entry contributes 0.
    """
    reciprocal_ranks = []
    for row in rs:
        hit_positions = np.asarray(row).nonzero()[0]
        if hit_positions.size:
            reciprocal_ranks.append(1. / (hit_positions[0] + 1))
        else:
            reciprocal_ranks.append(0.)
    return np.mean(reciprocal_ranks)


def evaluate_mmr(args):
    """Print the mean reciprocal rank of the retriever on a labelled test CSV.

    Every ``question`` in the CSV is embedded and ranked against the pickled
    candidate embeddings by cosine similarity. The row's ``title`` names the
    expected candidate; a title that is not among the candidates contributes
    a reciprocal rank of 0.

    Args:
        args: Dict read for ``"model_type"``, ``"config_name"``,
            ``"tokenizer_name"``, ``"model_name_or_path"``, ``"device"``,
            ``"max_seq_length"`` and ``"data_dir"`` (read with
            ``pd.read_csv``); it is also forwarded to ``evaluate``.
    """
    embeddings_path = "/content/drive/My Drive/embeddings.pkl"

    # load model
    set_seed()
    task_name = ""
    model_type = args["model_type"]

    config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
    config = config_class.from_pretrained(
        args["config_name"],
        finetuning_task=task_name,
        cache_dir=None,
    )
    tokenizer = tokenizer_class.from_pretrained(
        args["tokenizer_name"],
        do_lower_case=True,
        cache_dir=None,
    )
    model = model_class.from_pretrained(
        args["model_name_or_path"],
        from_tf=bool(".ckpt" in args["model_name_or_path"]),
        config=config,
        cache_dir=None,
    )
    model.to(args["device"])

    # load candidate embeddings
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load an embeddings file that this notebook wrote itself.
    with open(embeddings_path, "rb") as fin:
        candidate_title, candidate_reply, candidate_embeddings = pickle.load(fin)

    # load test data
    df=pd.read_csv(args["data_dir"])
    questions=df["question"].tolist()
    matched_questions=df["title"].tolist()

    # Map each candidate title to the index of its first occurrence, so every
    # expected title is resolved with one dict lookup instead of a full scan.
    first_index_by_title = {}
    for candidate_index, candidate in enumerate(candidate_title):
        first_index_by_title.setdefault(candidate, candidate_index)

    # One column per row so it broadcasts against the ranking matrix below;
    # -1 marks "not among the candidates" and never equals a real index.
    matched_questions_index = np.asarray(
        [[first_index_by_title.get(q, -1)] for q in matched_questions]
    )

    #convert questions in test data to BERT input
    examples = [InputExample(guid=0, text_a=title, text_b=None, label=1) for title in questions]

    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=[1],
        output_mode="classification",
        max_length=args["max_seq_length"],
        pad_on_left=bool(args["model_type"] in ["xlnet"]),  # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
    )
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids)
    # evaluate() returns the pooled (sentence-level) outputs, one per batch.
    pooled_outputs = evaluate(args, model, dataset)
    question_embedding = torch.cat([o.cpu() for o in pooled_outputs]).numpy()

    # Rank all candidates for every question, best first, then mark where the
    # expected candidate index appears in each ranking.
    scores = cosine_similarity(question_embedding, candidate_embeddings)
    sorted_indices = scores.argsort()[:, ::-1]
    mrr = mean_reciprocal_rank(sorted_indices==matched_questions_index)
    print("mean reciprocal rank: {}".format(mrr))


In [44]:
# Settings for the mean-reciprocal-rank evaluation.
# "data_dir" is the path of the CSV file that gets read, despite its name.
args={
    "model_type":"bert",
    "data_dir": "/content/drive/My Drive/Data/lawzhidao_evaluate.csv",
    "model_name_or_path": "/content/drive/My Drive/chinese_wwm_ext_pytorch/",
    "config_name": "/content/drive/My Drive/chinese_wwm_ext_pytorch/",
    "tokenizer_name": "/content/drive/My Drive/chinese_wwm_ext_pytorch/",
    "do_train":False,
    "do_eval":False,
    "evaluate_during_training":False,
    "do_lower_case":False,
    "per_gpu_train_batch_size":8,
    "per_gpu_eval_batch_size":8,
    "gradient_accumulation_steps":1,
    "learning_rate":5e-5,
    "adam_epsilon":1e-8,
    "max_grad_norm":1.0,  # was listed twice with the same value
    "weight_decay":0.0,
    "max_seq_length":128,
    # Use the GPU when one is available, otherwise stay on the CPU.
    "device":"cuda" if torch.cuda.is_available() else "cpu",
}

evaluate_mmr(args)

01/20/2020 08:09:31 - INFO - transformers.configuration_utils -   loading configuration file /content/drive/My Drive/chinese_wwm_ext_pytorch/config.json
01/20/2020 08:09:31 - INFO - transformers.configuration_utils -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": "",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "pruned_heads"

mean reciprocal rank: 0.1838426239890649
