In [1]:
import torch
torch.cuda.is_available()

True

In [2]:
!pip install transformers



In [0]:
import argparse
import glob
import json
import logging
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

In [4]:
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    get_linear_schedule_with_warmup,
    BertConfig,
    BertModel,
    BertPreTrainedModel,
    BertTokenizer,)

In [0]:
from transformers import glue_compute_metrics as compute_metrics
from transformers import glue_convert_examples_to_features as convert_examples_to_features
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors

In [0]:
from transformers.data.processors.utils import InputExample, DataProcessor

In [0]:
try:
    from torch.utils.tensorboard import SummaryWriter   #version 1.14 or higher
except ImportError:
    from tensorboardX import SummaryWriter

In [0]:
import code
import os
import pickle
from sklearn.metrics.pairwise import cosine_similarity
cosine=nn.CosineSimilarity(dim=1,eps=1e-6) 

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
logger=logging.getLogger(__name__)

In [0]:
MODEL_CLASSES={
    "bert":(BertConfig,BertTokenizer),
}

In [0]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

In [0]:
class FAQProcessor(DataProcessor):
    def get_data_from_file(self, file_dir):
        # 从csv文件读入原句和正例
        train_df = pd.read_csv(file_dir, sep="\t")
        self.candidate_title = train_df["title"].tolist()
        self.candidate_reply = train_df["reply"].tolist()
        self.candidate_translated = train_df["translated"].tolist()


    def create_train_data(self):
        #生成训练数据集
        return self._create_examples(self.candidate_title, "original"),\
                self._create_examples(self.candidate_translated, "pos")

               

    def _create_examples(self, lines, set_type):
        """制作example"""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            examples.append(InputExample(guid=guid, text_a=line, text_b=None, label=1))
        return examples


In [0]:
class BertForFAQHinge(BertPreTrainedModel):
    def __init__(self,config):
        super(BertPreTrainedModel,self).__init__(config)
        self.num_labels=config.num_labels
            
        self.bert=BertModel(config)
        self.dropout=nn.Dropout(config.hidden_dropout_prob)
        
        self.init_weights()
    
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        outputs=self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )   # 输出是 sequence_output, pooled_output, (hidden_states), (attentions)
        
        sequence_output, pooled_output=outputs[:2]
        # 1. CLS token
        output=pooled_output
        # 2. MEAN sequence
        #output=torch.mean(sequence_output,dim=1)
        # 3. MAX sequence
        #output=torch.max(sequence_output,dim=1)[0]
        
        return output

In [0]:
# 只做单个eval,所以inputs只有一个
def evaluate(args,model,eval_dataset):
    outputs=[]

    
    eval_sampler =RandomSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["batch_size"])
    
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args["batch_size"])
    eval_loss=0.0
    nb_eval_steps=0
    preds=None
    out_label_ids=None
    
    for batch in tqdm(eval_dataloader,desc="Evaluating"):
        model.eval()
        batch=tuple(t.to(args["device"]) for t in batch)
        
        with torch.no_grad():
            inputs={"input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2]} 
            output=model(**inputs)
            
            outputs.append(output)

  
    return outputs

In [0]:
def train(args,train_dataset,model,processor,tokenizer):
    no_decay=["bias","LayerNorm.weight"]
    optimizer_grouped_parameters=[
        {
            "params":[p for n,p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay":args["weight_decay"],

        },
        {
            "params": [p for n,p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay":0.0
        },
    ]

    
    t_total=len(train_dataset)//args["gradient_accumulation_steps"] * args["num_train_epochs"]
    optimizer=AdamW(optimizer_grouped_parameters,lr=args["learning_rate"],eps=args["adam_epsilon"])
    # bert里的小技巧, bert里的learning rate是不断变化的,先往上升,再往下降,这个scheduler就是用来设置这个
    scheduler=get_linear_schedule_with_warmup(
        optimizer,num_warmup_steps=args["warmup_steps"],num_training_steps=t_total
        )
    
    
    
    # *********************
    logger.info("*****Running training*****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args["num_train_epochs"])


    epochs_trained=0
    global_step=0
    steps_trained_in_current_epoch=0

    tr_loss,logging_loss=0.0,0.0
    model.zero_grad()
    train_iterator=trange(epochs_trained,args["num_train_epochs"],desc="Epoch",disable=False)

    set_seed()

    for k in train_iterator: #共5个epoch
    
        #train_sampler=RandomSampler(train_dataset)
        # 因为原句和正例没有匹配打包,所以这里只能用SequentialSampler, 不能随机
        train_sampler=SequentialSampler(train_dataset)
        train_dataloader=DataLoader(train_dataset,sampler=train_sampler,batch_size=args["batch_size"])
        epoch_iterator=tqdm(train_dataloader,desc="Iteration",disable=False)

        for step,batch in enumerate(epoch_iterator): #每个epoch里555次iteration
            if steps_trained_in_current_epoch>0:
                steps_traned_in_current_epoch-=1
                continue

            model.train()
            batch=tuple(t.to(args["device"]) for t in batch)
            original_inputs={"input_ids":batch[0],"attention_mask":batch[1],"token_type_ids":batch[2]}
            pos_inputs={"input_ids":batch[3],"attention_mask":batch[4],"token_type_ids":batch[5]}

            #根据args传进来的模式, 生成不同的embedding
            original_outputs=model(**original_inputs)
            pos_outputs=model(**pos_inputs)

            # original_outputs,   size是32*768,type是torch.Tensor 



            #这里生成negative example
            neg_outputs=[]
            ''' 
            针对每个原句,在这个batch里找一个负例
            比如这个batch里有32条句子, 我就拿里面每一条句子和其他31条句子比对,找出分数最高的一个作为负例.
            当然因为我把32条句子都整合在candidate_embeddings里面,所以每个句子会和它自己比较. 因此有个for循环
            找到第一个和原句不同的句子即可.
            '''

            for original_embedding in original_outputs:
                scores = cosine(original_embedding.unsqueeze(0), original_outputs).detach().cpu().numpy()
                top3_indices = scores.argsort()[-3:][::-1]
                # 找到第一个和原句不同的句子作为负例,因为分数最高的可能是句子自己
                for index in top3_indices:
                    if not torch.all(original_embedding==original_outputs[index]):
                        neg_outputs.append(original_outputs[index])
                        break

            neg_outputs=torch.stack([c for c in neg_outputs])

            pos_score=cosine(original_outputs,pos_outputs)
            neg_score=cosine(original_outputs,neg_outputs)

            loss=-(pos_score-neg_score-args["margin"])
            loss[loss<0]=0
            loss=torch.mean(loss)
            loss.backward()

            tr_loss+=loss.item()
            if (step+1)%args["gradient_accumulation_steps"]==0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),args["max_grad_norm"])

                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step+=1

        logger.info("average loss:" +str(tr_loss/global_step))


    return global_step,tr_loss/global_step

In [0]:
def load_examples(args, tokenizer, processor):

    original_data,pos_data= processor.create_train_data()
 
    # Load data features from cache or dataset file
    logger.info("Creating features from dataset file at %s", args["data_dir"])

    original_features = convert_examples_to_features(
        original_data,
        tokenizer,
        label_list=[1],
        output_mode="classification",
        max_length=args["max_seq_length"],
        pad_on_left=bool(args["model_type"] in ["xlnet"]),  # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
    )
    pos_features = convert_examples_to_features(
        pos_data,
        tokenizer,
        label_list=[1],
        output_mode="classification",
        max_length=args["max_seq_length"],
        pad_on_left=bool(args["model_type"] in ["xlnet"]),  # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
    )

    # Convert to Tensors and build dataset
    original_input_ids = torch.tensor([f.input_ids for f in original_features], dtype=torch.long)
    original_attention_mask = torch.tensor([f.attention_mask for f in original_features], dtype=torch.long)
    original_token_type_ids = torch.tensor([f.token_type_ids for f in original_features], dtype=torch.long)

    pos_input_ids = torch.tensor([f.input_ids for f in pos_features], dtype=torch.long)
    pos_attention_mask = torch.tensor([f.attention_mask for f in pos_features], dtype=torch.long)
    pos_token_type_ids = torch.tensor([f.token_type_ids for f in pos_features], dtype=torch.long)


    if args["do_train"]:
        dataset = TensorDataset(original_input_ids, original_attention_mask, original_token_type_ids,\
                                pos_input_ids, pos_attention_mask, pos_token_type_ids)
    elif args["do_eval"]:
        dataset = TensorDataset(original_input_ids, original_attention_mask, original_token_type_ids)
    else:
        print("Something wrong in load_examples function!!!!")

    return dataset, processor.candidate_title, processor.candidate_reply

In [0]:
def main(args):
        
    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    
    
    set_seed()
    model_type = args["model_type"]
    
    
    config_class, tokenizer_class = MODEL_CLASSES[model_type]
    model_class=BertForFAQHinge

    config = config_class.from_pretrained(
        args["config_name"],
        finetuning_task=args["task_name"], 
        cache_dir=None,
    )
    tokenizer = tokenizer_class.from_pretrained(
        args["tokenizer_name"],
        do_lower_case=True,
        cache_dir=None,
    )
    model = model_class.from_pretrained(
        args["model_name_or_path"],
        from_tf=bool(".ckpt" in args["model_name_or_path"]),
        config=config,
        cache_dir=None,
    )
    
    
    model.to(args["device"])
    
    logger.info("Training/evaluation parameters %s", args)
    
    processor=FAQProcessor()
    processor.get_data_from_file(args["data_dir"])
    
    dataset,candidate_title,candidate_reply=load_examples(args,tokenizer,processor)
    
    
    if args["do_train"]:
        train(args,dataset,model,processor,tokenizer)
        if not os.path.exists(args["output_dir"]):
            os.makedirs(args["output_dir"])
        
        logger.info("Saving model checkpoint to %s", args["output_dir"])
        
        model.save_pretrained(args["output_dir"])
        tokenizer.save_pretrained(args["output_dir"])
        
        torch.save(args,os.path.join(args["output_dir"],"training_args.bin"))
        
        model=model_class.from_pretrained(args["output_dir"])
        tokenizer=tokenizer_class.from_pretrained(args["output_dir"])
        model.to(args["device"])
    
    
    if args["do_eval"]:
        # load dataset
        if not os.path.exists("/content/drive/My Drive/hinge_embeddings.pkl"):
            logger.info("Training/evaluation parameters %s", args)

            eval_dataset, candidate_title, candidate_reply = load_examples(args, tokenizer,processor)
        
            outputs = evaluate(args, model, eval_dataset)
        
            #把所有candidates embedding拼起来, 每个pooled_output是32, 拼起来一共 18677*768
            candidate_embeddings = torch.cat([o.cpu().data for o in outputs]).numpy()

            with open("/content/drive/My Drive/hinge_embeddings.pkl", "wb") as fout:
                pickle.dump([candidate_title, candidate_reply, candidate_embeddings], fout)

                
        else:
            with open("/content/drive/My Drive/hinge_embeddings.pkl", "rb") as fin:
                candidate_title, candidate_reply, candidate_embeddings = pickle.load(fin)


        while True:
            title = input("你的问题是？\n")
            if len(title.strip()) == 0:
                continue
            
            examples = [InputExample(guid=0, text_a=title, text_b=None, label=1)]
            features = convert_examples_to_features(
                examples,
                tokenizer,
                label_list=[1],
                output_mode="classification",
                max_length=args["max_seq_length"],
                pad_on_left=bool(args["model_type"] in ["xlnet"]),  # pad on the left for xlnet
                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
            )

            # Convert to Tensors and build dataset
            all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
            all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
            all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    
            dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids)
            outputs = evaluate(args, model, dataset)
            title_embedding = torch.cat([o.cpu().data for o in outputs]).numpy()

            scores = cosine_similarity(title_embedding, candidate_embeddings)[0]
            top5_indices = scores.argsort()[-5:][::-1]

            for index in top5_indices:
                print("可能的答案，参考问题：" + candidate_title[index] + "\t答案：" + candidate_reply[index] + "\t得分：" + str(scores[index]))
                print()

In [0]:
args_train={
    "model_type":"bert",
    "data_dir": "/content/drive/My Drive/Data/preprocessed.csv",
    "output_dir":"/content/drive/My Drive/cos_hinge_models/",
    "model_name_or_path": "/content/drive/My Drive/chinese_wwm_ext_pytorch/",
    "config_name": "/content/drive/My Drive/chinese_wwm_ext_pytorch/#",
    "tokenizer_name": "/content/drive/My Drive/chinese_wwm_ext_pytorch/",
    "do_train":True,
    "do_eval":False,
    "evaluate_during_training":False,
    "do_lower_case":False,
    "per_gpu_train_batch_size":32,
    "per_gpu_eval_batch_size":32,
    "batch_size":32,
    "gradient_accumulation_steps":1,
    "learning_rate":2e-5,
    "adam_epsilon":1e-8,
    "max_grad_norm":1.0,
    "weight_decay":0.0,
    "max_grad_norm":1.0,
    "max_seq_length":128,
    "weight_decay":0.0,
    "num_train_epochs":10,
    "device":"cpu",
    "margin":5,
    "warmup_steps":0,
    "task_name":"",
    
}

if torch.cuda.is_available():
    args_train["device"]="cuda"


#main(args_train)

In [0]:
args_eval={
    "model_type":"bert",
    "data_dir": "/content/drive/My Drive/Data/preprocessed.csv",
    "output_dir":"/content/drive/My Drive/cos_hinge_models/",
    "model_name_or_path": "/content/drive/My Drive/cos_hinge_models/",
    "config_name": "/content/drive/My Drive/cos_hinge_models/",
    "tokenizer_name": "/content/drive/My Drive/cos_hinge_models/",
    "do_train":False,
    "do_eval":True,
    "evaluate_during_training":False,
    "do_lower_case":False,
    "per_gpu_train_batch_size":32,
    "per_gpu_eval_batch_size":32,
    "batch_size":32,
    "gradient_accumulation_steps":1,
    "learning_rate":2e-5,
    "adam_epsilon":1e-8,
    "max_grad_norm":1.0,
    "weight_decay":0.0,
    "max_grad_norm":1.0,
    "max_seq_length":128,
    "weight_decay":0.0,
    "num_train_epochs":10,
    "device":"cpu",
    "margin":5,
    "warmup_steps":0,
    "task_name":"",
    
    
}

if torch.cuda.is_available():
    args_eval["device"]="cuda"


#main(args_eval)

In [0]:
def mean_reciprocal_rank(rs):
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])


def evaluate_mmr(args):
    # load model
    set_seed()
    task_name = ""
    model_type = args["model_type"]
    
    
    config_class, tokenizer_class = MODEL_CLASSES[model_type]
    model_class=BertForFAQHinge
    
    config = config_class.from_pretrained(
        args["config_name"],
        finetuning_task=task_name, 
        cache_dir=None,
    )
    tokenizer = tokenizer_class.from_pretrained(
        args["tokenizer_name"],
        do_lower_case=True,
        cache_dir=None,
    )
    model = model_class.from_pretrained(
        args["model_name_or_path"],
        from_tf=bool(".ckpt" in args["model_name_or_path"]),
        config=config,
        cache_dir=None,
    )
    model.to(args["device"])
    
    
    
    # load candidate embeddings
    with open("/content/drive/My Drive/hinge_embeddings_cls.pkl", "rb") as fin:
        candidate_title, candidate_reply, candidate_embeddings = pickle.load(fin)
    
    
    # load test data
    df=pd.read_csv(args["data_dir"])
    questions=df["question"].tolist()
    matched_questions=df["title"].tolist()
    matched_questions_index = []
    for q in matched_questions:
        flg = False
        for i, _q in enumerate(candidate_title):
            if q == _q:
                matched_questions_index.append([i])
                flg = True
                break
        if flg == False:
            matched_questions_index.append([-1])
    
    matched_questions_index = np.asarray(matched_questions_index)
    
    
    #convert questions in test data to BERT input
    examples = [InputExample(guid=0, text_a=title, text_b=None, label=1) for title in questions]

    
    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=[1],
        output_mode="classification",
        max_length=args["max_seq_length"],
        pad_on_left=bool(args["model_type"] in ["xlnet"]),  # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
    )
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)


    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids)
    sequence_outputs = evaluate(args, model, dataset)
    question_embedding = torch.cat([o.cpu() for o in sequence_outputs]).numpy()


    
    
    scores = cosine_similarity(question_embedding, candidate_embeddings)
    sorted_indices = scores.argsort()[:, ::-1]#[-5:][::-1]
    # code.interact(local=locals())
    mmr = mean_reciprocal_rank(sorted_indices==matched_questions_index)
    print("mean reciprocal rank: {}".format(mmr))

In [28]:
args={
    "model_type":"bert",
    "data_dir": "/content/drive/My Drive/Data/lawzhidao_evaluate.csv",
    "model_name_or_path": "/content/drive/My Drive/cos_hinge_models/",
    "config_name": "/content/drive/My Drive/cos_hinge_models/",
    "tokenizer_name": "/content/drive/My Drive/cos_hinge_models/",
    "do_train":False,
    "do_eval":False,
    "evaluate_during_training":False,
    "do_lower_case":False,
    "per_gpu_train_batch_size":32,
    "per_gpu_eval_batch_size":32,
    "batch_size":32,
    "gradient_accumulation_steps":1,
    "learning_rate":5e-5,
    "adam_epsilon":1e-8,
    "max_grad_norm":1.0,
    "weight_decay":0.0,
    "max_grad_norm":1.0,
    "max_seq_length":128,
    "device":"cpu",
    
    
}


if torch.cuda.is_available():
    args["device"]="cuda"
    
    
evaluate_mmr(args)

Evaluating: 100%|██████████| 2/2 [00:00<00:00,  6.02it/s]


mean reciprocal rank: 0.00023563829506073443
