In [1]:
import mindspore
import os
import json
import argparse
import logging
from transformers import BertTokenizer
from mindnlp.models.gpt2 import gpt2
from mindspore import ops
from mindspore.ops import operations as P

In [26]:
"""
Sets up the training arguments.
"""
parser = argparse.ArgumentParser()
parser.add_argument('--device', default='5', type=str, required=False, help='生成设备')
parser.add_argument('--temperature', default=1, type=float, required=False, help='生成的temperature')
parser.add_argument('--topk', default=10, type=int, required=False, help='最高k选1')
parser.add_argument('--topp', default=0, type=float, required=False, help='最高积累概率')
parser.add_argument('--model_config', default='/GPT2-Summary-mindspore/summary_model/config.json', type=str, required=False,
                    help='模型参数')
parser.add_argument('--log_path', default='GPT2-Summary-mindspore/data/interacting.log', type=str, required=False, help='interact日志存放位置')
parser.add_argument('--voca_path', default='GPT2-Summary-mindspore/vocabulary/vocab_small.txt', type=str, required=False, help='选择词库')
parser.add_argument('--dialogue_model_path', default='/home/daiyuxin/cjh1/news_summary/GPT2-Summary-mindspore/summary_model/model_epoch4', type=str, required=False, help='对话模型路径')
parser.add_argument('--save_samples_path', default="GPT2-Summary-mindspore/sample/", type=str, required=False, help="保存聊天记录的文件路径")
parser.add_argument('--repetition_penalty', default=1.2, type=float, required=False,
                    help="重复惩罚参数，若生成的对话重复性较高，可适当提高该参数")
parser.add_argument('--seed', type=int, default=None, help='设置种子用于生成随机数，以使得训练的结果是确定的')
parser.add_argument('--max_len', type=int, default=120, help='每个utterance的最大长度,超过指定长度则进行截断')
parser.add_argument('--max_history_len', type=int, default=1, help="dialogue history的最大长度")
parser.add_argument('--no_cuda', default=False, help='不使用GPU进行预测')

args = parser.parse_args(args=[])
os.environ["CUDA_VISIBLE_DEVICES"] = args.device

In [27]:
def create_logger(args):
    """
    将日志输出到日志文件和控制台
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    formatter = logging.Formatter(
        '%(asctime)s - %(levelname)s - %(message)s')

    # 创建一个handler，用于写入日志文件
    file_handler = logging.FileHandler(
        filename=args.log_path)
    file_handler.setFormatter(formatter)
    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)

    # 创建一个handler，用于将日志输出到控制台
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    console.setFormatter(formatter)
    logger.addHandler(console)

    return logger

logger = create_logger(args)

In [28]:
# 加载模型参数进入模型中
tokenizer = BertTokenizer(vocab_file=args.voca_path)
model = gpt2.GPT2LMHeadModel.from_pretrained(args.dialogue_model_path)
model.set_train(False)

GPT2LMHeadModel<
  (transformer): GPT2Model<
    (wte): Embedding<vocab_size=13317, embedding_size=768, use_one_hot=False, embedding_table=Parameter (name=transformer.wte.embedding_table, shape=(13317, 768), dtype=Float32, requires_grad=True), dtype=Float32, padding_idx=None>
    (wpe): Embedding<vocab_size=1024, embedding_size=768, use_one_hot=False, embedding_table=Parameter (name=transformer.wpe.embedding_table, shape=(1024, 768), dtype=Float32, requires_grad=True), dtype=Float32, padding_idx=None>
    (drop): Dropout<>
    (h): CellList<
      (0): GPT2Block<
        (ln_1): LayerNorm<normalized_shape=(768,), begin_norm_axis=-1, begin_params_axis=-1, gammaParameter (name=transformer.h.0.ln_1.gamma, shape=(768,), dtype=Float32, requires_grad=True), beta=Parameter (name=transformer.h.0.ln_1.beta, shape=(768,), dtype=Float32, requires_grad=True)>
        (attn): GPT2Attention<
          (c_attn): Conv1D<>
          (c_proj): Conv1D<>
          (attn_dropout): Dropout<>
          (resi

In [29]:
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
        Args:
            logits: logits distribution shape (vocabulary size)
            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
            top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. 
    """
    assert len(logits.shape) == 1  # batch size 1 for now - could be updated for more but the code would be less clear
    top_k = min(top_k, logits.shape[-1])  # Safety check
    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        # mindspore.ops.operations.Topk()返回最后一维最大的top_k个元素，返回值为二维(values,indices)
        # mindspore.ops.TopK()操作来获取一个Tensor中每一行的前k个最大值和它们的索引
        # ...表示其他维度由计算机自行推断
        topk = P.TopK(sorted=True)
        less = ops.Less()
        
        indices_to_remove = less(logits, topk(logits, top_k)[0][-1])
        logits[indices_to_remove] = filter_value  # 对于topk之外的其他元素的logits值设为负无穷

    if top_p > 0.0:
        sorted_logits, sorted_indices = topk(logits, logits.shape[-1])  # 对logits进行递减排序
        #cumulative_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)
        cumulative_probs = ops.cumsum(ops.softmax(sorted_logits), axis=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove_copy = mindspore.Tensor(sorted_indices_to_remove.asnumpy().copy())
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove_copy[..., :-1]
        sorted_indices_to_remove[..., 0] = 0

        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits

In [30]:
print('***********************Summary model start************************')

with open("GPT2-Summary-mindspore/data/evaluation_with_ground_truth.txt", "r", encoding="utf-8") as f:
    for line in f:
        data = json.loads(line.strip())
        hypothesis = data["article"]
        for i in range(1):
            if len(hypothesis) : hypothesis = hypothesis[:900]
            input_ids = [tokenizer.cls_token_id]  # 每个input以[CLS]为开头
            input_ids.extend(tokenizer.encode(hypothesis))
            input_ids.append(tokenizer.sep_token_id)
            curr_input_tensor = mindspore.Tensor(input_ids).astype(mindspore.int64)

            generated = []
            # 最多生成max_len个token
            for _ in range(args.max_len):
                outputs = model(input_ids=curr_input_tensor)
                next_token_logits = outputs[0][-1, :]
                
                # 对于已生成的结果generated中的每个token添加一个重复惩罚项，降低其生成概率
                for id in set(generated):
                    next_token_logits[id] /= args.repetition_penalty
                next_token_logits = next_token_logits / args.temperature
                # 对于[UNK]的概率设为无穷小，也就是说模型的预测结果不可能是[UNK]这个token
                next_token_logits[tokenizer.convert_tokens_to_ids('[UNK]')] = -float('Inf')
                filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=args.topk, top_p=args.topp)
                
                # mindspore.ops.multinomial表示从候选集合中无放回地进行抽取num_samples个元素，权重越高，抽到的几率越高，返回元素的下标
                next_token = ops.multinomial(ops.softmax(filtered_logits), 1, replacement=False).astype(mindspore.int64)
                if next_token == tokenizer.sep_token_id:  # 遇到[SEP]则表明response生成结束
                    break
                generated.append(next_token.asnumpy().item())
                concat = P.Concat(axis=0)
                curr_input_tensor = concat(curr_input_tensor, next_token)
                
            hypothesis = tokenizer.convert_ids_to_tokens(generated)
            reference = data["summarization"]
            hypothesis = ''.join(hypothesis)
            with open('GPT2-Summary-mindspore/complete/hypothesis_mindspore.txt', 'a') as f1:
                    f1.write(hypothesis.strip() + '\n')
            with open('GPT2-Summary-mindspore/complete/reference_mindspore.txt', 'a') as f2:
                    f2.write(reference.strip() + '\n')

***********************Summary model start************************


KeyboardInterrupt: 

In [31]:
from sumeval.metrics.rouge import RougeCalculator

rouge = RougeCalculator(stopwords=True, lang="en")

In [32]:
 # 读取生成的摘要文件和参考文件的内容
with open('GPT2-Summary-mindspore/complete/hypothesis_mindspore.txt', 'r') as f:
    generated_summary = f.read()
with open('GPT2-Summary-mindspore/complete/reference_mindspore.txt', 'r') as f:
    reference_summary = f.read()

In [33]:
 # 计算 ROUGE 分数
rouge_1 = rouge.rouge_n(summary=generated_summary, references=[reference_summary], n=1)
rouge_2 = rouge.rouge_n(summary=generated_summary, references=[reference_summary], n=2)
rouge_l = rouge.rouge_l(summary=generated_summary, references=[reference_summary])

# 打印 ROUGE 分数
with open('GPT2-Summary-mindspore/complete/score_mindspore.txt', 'w') as f:
    f.write("对得分进行平均之后的结果(mindspore): "+ '\n')
    f.write("ROUGE-1的输出为: " + str(rouge_1) + '\n')
    f.write("ROUGE-2的输出为: " + str(rouge_2) + '\n')
    f.write("ROUGE-L的输出为: " + str(rouge_l) + '\n')
print('ROUGE-1:', rouge_1)
print('ROUGE-2:', rouge_2)
print('ROUGE-L:', rouge_l)

ROUGE-1: 0.6419391573125102
ROUGE-2: 0.3560618388934093
ROUGE-L: 0.293151130632829
