In [1]:
import pandas as pd
import numpy as np
import os
import torch
from collections import defaultdict
from itertools import combinations, chain
import pickle
import copy
from docx import Document
from zhon import hanzi

import re
from zhon import hanzi

from tqdm import tqdm
tqdm.pandas(desc="progress: ")

from importlib import reload

# Utility variable
import sys, getopt
sys.path.insert(0, '../..')

# var
import var.var as V
import var.path as P

# utils
import utils.data as D
import utils.io as IO
import utils.preprocess as PP

In [2]:
import torch
from torch import Tensor

## Process Command Line Arguments

In [3]:
opts, args = getopt.getopt(sys.argv[1:], "d:e:m:l:s:f:t")

In [4]:
SIGNIFICANCE_MODEL_SAVE_DIR_NAME = "significance_pHAN_cmt_cos_dist_wo_cmt_aug_all_2022-12-12_mixed_3"
EPOCH = 20
MAX_SENT = 8
UNI_MAX_SENT = 2
MAX_SENT_LEN = 60
LAMBDA = 0.3
NORM_RATIO = 4
VAL_OR_TEST = 'test'
debug = False
EVIDENCE_SCORE_THRESHOLD = 0.75

for opt, arg in opts:
    if opt == '-d':
        SIGNIFICANCE_MODEL_SAVE_DIR_NAME = arg
    elif opt == '-e':
        EPOCH = int(arg)
    elif opt == '-s':
        MAX_SENT = int(arg)
    elif opt == '-m':
        MAX_SENT_LEN = int(arg)
    elif opt == '-l':
        LAMBDA = float(arg)
    elif opt == '-t':
        VAL_OR_TEST = 'test'
    elif opt == '-f':
        debug = True

In [5]:
if 'train' in SIGNIFICANCE_MODEL_SAVE_DIR_NAME:
    TRAIN_OR_ALL = 'train'
elif 'all' in SIGNIFICANCE_MODEL_SAVE_DIR_NAME:
    TRAIN_OR_ALL = 'all'
    
if 'wo' in SIGNIFICANCE_MODEL_SAVE_DIR_NAME:
    COMMENT_AUGMENTATION = False
else:
    COMMENT_AUGMENTATION = True

### BERT NSP Model

In [6]:
GPU_NUM = 0
device = torch.device(GPU_NUM)

In [7]:
BERT_NSP_MODEL_NAME = 'bert-base-chinese'
BERT_NSP_TOKENIZER_NAME = 'bert-base-chinese'

In [8]:
from transformers import BertTokenizerFast, BertForNextSentencePrediction

In [9]:
bert_nsp_tokenizer = BertTokenizerFast.from_pretrained(BERT_NSP_TOKENIZER_NAME)
bert_nsp_model = BertForNextSentencePrediction.from_pretrained(BERT_NSP_MODEL_NAME).to(device)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForNextSentencePrediction: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
SBERT_MODEL_NAME = 'ckiplab/bert-base-chinese'

In [11]:
from sentence_transformers import SentenceTransformer

sbert_model = SentenceTransformer(SBERT_MODEL_NAME).to(device)

Some weights of the model checkpoint at /home/pclightyear/.cache/torch/sentence_transformers/ckiplab_bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at /home/pclightyear/.cache/torch/sentence_transf

### Utils

In [12]:
def defaultdict_init_defaultdict_init_by_int():
    return defaultdict(int)

def defaultdict_init_defaultdict_init_by_float():
    return defaultdict(float)

def defaultdict_init_defaultdict_init_by_str():
    return defaultdict(str)

## Build the reference citation dictionary

In [13]:
df_recommendation_letters = D.read_df_recommendation_letters()

In [None]:
df_recommendation_letters.tail()

In [15]:
rl_info_dict = defaultdict(defaultdict_init_defaultdict_init_by_str)

for _, row in df_recommendation_letters.iterrows():
    _year = int(row['year'])
    _id = int(row['id'])

    rl_sent = row['all_paragraph_sent']
    info = "，".join([info for info in row['info'] if len(info) <= 10])
    
    if info == "":
        continue
        
    sent_info_dict = defaultdict_init_defaultdict_init_by_str()
    
    for sent in rl_sent:
        sent_info_dict[sent] = info
        
    rl_info_dict[(_year, _id)] = rl_info_dict[(_year, _id)] | sent_info_dict    

## Calculate the evidence score for each sentence

In [16]:
significance_pseudo_summary_dir = os.path.join(P.FP_SIGNIFICANCE_PSEUDO_SUMMARY_DIR, 'custom_bertopic', TRAIN_OR_ALL)
significance_all_data_dir = os.path.join(significance_pseudo_summary_dir, 'all_data')

uniqueness_pseudo_summary_dir = os.path.join(P.FP_UNIQUENESS_PSEUDO_SUMMARY_DIR, 'custom_bertopic', TRAIN_OR_ALL)
uniqueness_all_data_dir = os.path.join(uniqueness_pseudo_summary_dir, 'all_data')

In [None]:
i = 0
sent_evidence_score_dict = defaultdict(defaultdict_init_defaultdict_init_by_float)

for file in tqdm(os.listdir(significance_all_data_dir)):
    fn = os.path.join(significance_all_data_dir, file)
    
#     IO.print_dividing_line()
#     if i >= 1:
#         break
    
    if os.path.isdir(fn):
        continue

#     print(fn)
        
    with open(fn, "rb") as f:
        group_data = pickle.load(f)

    ## process group data
    candidate_sents_info_buffer = group_data["candidate_sents_info_buffer"]
    chunk_debug_info_buffer = group_data["chunk_debug_info_buffer"]
    
#     print(candidate_sents_info_buffer)
#     print(chunk_debug_info_buffer)

    for info, debug_info in chunk_debug_info_buffer.items():
        buffer_dict = defaultdict_init_defaultdict_init_by_float()
        
#         print(info, debug_info)
        sents = candidate_sents_info_buffer[info]['sents']
        chunks = debug_info['chunks']
        chunk_evidence_scores = debug_info['evidence_score']
    
#         print(info)
#         print(sents)
#         print(chunks)
#         print(chunk_evidence_scores)
        
        for chunk, chunk_evidence_score in zip(chunks, chunk_evidence_scores):
            # find corresponding sentences
            for sent in sents:
                ## aggregate sent evidence score
                if chunk in sent:
                    buffer_dict[sent] = max(
                        buffer_dict[sent], chunk_evidence_score
                    )
                    
        sent_evidence_score_dict[info] = buffer_dict
    
    i += 1

In [None]:
len(sent_evidence_score_dict)

## All data

In [None]:
all_candidate_sents_info_buffer = {}
all_chunk_debug_info_buffer = {}

for file in tqdm(os.listdir(significance_all_data_dir)):
    fn = os.path.join(significance_all_data_dir, file)
    
    if os.path.isdir(fn):
        continue
        
    with open(fn, "rb") as f:
        group_data = pickle.load(f)
        
    candidate_sents_info_buffer = group_data["candidate_sents_info_buffer"]
    chunk_debug_info_buffer = group_data["chunk_debug_info_buffer"]
    
    all_candidate_sents_info_buffer |= candidate_sents_info_buffer
    all_chunk_debug_info_buffer |= chunk_debug_info_buffer

In [None]:
len(all_candidate_sents_info_buffer)

In [None]:
len(all_chunk_debug_info_buffer)

## Long sentence post process utils

In [22]:
from sentence_transformers.util import cos_sim

In [23]:
def get_sent_len(s):
    re_alphanumeric = '[a-zA-Z0-9_]+'
    re_ch_p = '[{}]'.format(hanzi.characters + hanzi.punctuation)
    
    l = 0
    
    ## find all english and number token
    l += len(re.findall(re_alphanumeric, s))
    s = re.sub(re_alphanumeric, '', s)
    
    ## remove whitespace
    s = re.sub('\s', '', s)
    
    ## count chinese character
    l += len(re.findall(re_ch_p, s))
    
    return l

In [24]:
def find_longest_consecutive_sequence(seq):
    try:
        assert len(seq) >= 1
    except:
        print(seq)
        assert 1 == 0
    
    cand_seqs = []
    
    if len(seq) == 1:
        return seq

    cur_num = seq[0]
    seq_buf = seq[:1]

    for num in seq[1:]:
        if num == cur_num + 1:
            seq_buf.append(num)
        else:
            cand_seqs.append(seq_buf)
            seq_buf = [num]

        cur_num = num

    cand_seqs.append(seq_buf)
    sorted_cand_seqs = sorted(cand_seqs, key=lambda l: -len(l))
    
    return sorted_cand_seqs[0]

In [25]:
def is_reasonable_sent(chunks, tokenizer, model, debug=False):
    ## get all sublist with length >= 2
    sublist_idx = []
    for start_idx, end_idx in combinations(range(len(chunks)+1), 2):
        if end_idx - start_idx > 1:
            sublist_idx.append((start_idx, end_idx))
    
    ## split sublist into two list with len >= 1
    text_pair = []
    for start_idx, end_idx in sublist_idx:
        sublist = chunks[start_idx:end_idx]
        for pivot in range(1, len(sublist)):
            l_sublist = sublist[0:pivot]
            r_sublist = sublist[pivot:len(sublist)]
            text_pair.append(('，'.join(l_sublist)+'。', '，'.join(r_sublist)+'。'))
    
    inputs = tokenizer(text_pair, return_tensors='pt', padding=True)

    for key in inputs:
        if isinstance(inputs[key], Tensor):
            inputs[key] = inputs[key].to(device)
    
    with torch.no_grad():
        outputs = model(**inputs)
        results = torch.argmax(outputs.logits, dim=-1)
        
    if debug:
        print(outputs)
        for p in text_pair:
            print(p)
        print(results)
    
    return bool(sum(results) == 0)

In [26]:
hanzi.punctuation

'＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､\u3000、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·！？｡。'

In [27]:
split_punc = '\n＂＃＄％＆＇＊＋，－／：；＜＝＞＠＼＾＿｀｜～､\u3000、〃〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·！？｡。'

In [28]:
parenthesis_punc = '＂（）［］｛｝｟｠｢｣〈〉《》「」『』【】〔〕〖〗〘〙〚〛'

In [29]:
def long_sentence_post_process(s, debug_info, ext_summary, debug=False):
    ## get debug chunk info
    debug_chunks_idx = []
    for i, chunk in enumerate(debug_info['chunks']):
        if chunk in s:
            debug_chunks_idx.append(i)

    if len(debug_chunks_idx) == 0:
        return ""
            
    ## find the longest consecutive sequence
    debug_chunks_idx = find_longest_consecutive_sequence(debug_chunks_idx)
    
    if debug:
        for _idx in debug_chunks_idx:
            print(debug_info['chunks'][_idx])
    
    ## directly split s to get all chunks
    cand_chunks = re.split('[{}]'.format(split_punc), s)
    cand_chunks = [c for c in cand_chunks if c]
    
    ## compute the len of each chunk
    chunks_len = [get_sent_len(cc) for cc in cand_chunks]
    ## brutal force to find text span that is below length limit
    span_tuple = combinations(range(len(cand_chunks)+1), 2)
    f_span_tuple = []

    if debug:
        print("cand chunks:", cand_chunks)
    
    for start_idx, end_idx in span_tuple:
        spans = cand_chunks[start_idx:end_idx]
    #     print(spans)
        ## check if satisfy length limit
        chunk_len = sum(chunks_len[start_idx:end_idx]) + end_idx - start_idx - 1
        if chunk_len <= MAX_SENT_LEN:
            f_span_tuple.append((start_idx, end_idx))
    
    if debug:
        print("span tuples:", f_span_tuple)
    
    ## remove text span that is the subset of other text span
    filtered_span_tuple = []
    for i, (start_idx, end_idx) in enumerate(f_span_tuple):
        spans = cand_chunks[start_idx:end_idx]
        unique = True
        for j, (_start_idx, _end_idx) in enumerate(f_span_tuple):
            if i == j:
                continue
            if _start_idx <= start_idx and end_idx <= _end_idx:
                unique = False

        if unique:
#             print('，'.join(spans))
            filtered_span_tuple.append((start_idx, end_idx))

    if debug:
        print("spans after brutal force search")
        print(filtered_span_tuple)
    
    ## remove unreasonable sentence
    buf = []
    for start_idx, end_idx in filtered_span_tuple:
        chunks = cand_chunks[start_idx:end_idx]
        if end_idx - start_idx == 1:
            buf.append((start_idx, end_idx))
        elif is_reasonable_sent(chunks, bert_nsp_tokenizer, bert_nsp_model):
            buf.append((start_idx, end_idx))
            
    filtered_span_tuple = buf

    if debug:
        print("spans after nsp filter")
        print(filtered_span_tuple)
    
    if len(filtered_span_tuple) == 0:
        return ''
    
    cand_score = []
    ## calculate final score
    for start_idx, end_idx in filtered_span_tuple:
        cc = cand_chunks[start_idx:end_idx]
        if debug:
#             print(cc)
            pass
        
        ## aggregate imp score from real chunks
        ## should contain inportant info (high importance)
        cand_importance = 0
        for _c in cc:
            for _idx in debug_chunks_idx:
                if _c in debug_info['chunks'][_idx]:
                    cand_importance += debug_info['importance'][_idx]
                    break
        
        cand_importance /= len(cc)
        
#         ## info should be novel (dissimilar with existing summary)
        cc_embed = sbert_model.encode('，'.join(cc), show_progress_bar=False)
        ext_summary_embed = sbert_model.encode(ext_summary, show_progress_bar=False)
        cand_novel = 1 - cos_sim(cc_embed, ext_summary_embed)[0][0]
        
        score = LAMBDA * cand_importance + (1 - LAMBDA) * cand_novel * NORM_RATIO
        cand_score.append(score)
        if debug:
            print('imp', float(cand_importance))
            print('novel', float(cand_novel))
            print(score)
        
    final_span_idx = np.argmax(cand_score)
    final_start_idx, final_end_idx = filtered_span_tuple[final_span_idx]
    final_chunks = cand_chunks[final_start_idx:final_end_idx]
    final_sent = '，'.join(final_chunks) + '。'
    
    return final_sent

In [30]:
import string
from zhon import hanzi

In [31]:
def sentence_post_process(s):
    s = s.strip()
    try:
        ## remove trailing number
        while s[0] in string.punctuation:
            s = s[1:]
    except:
        return ''
    s = s.strip()
    
    ## remove mojibake
    s = s.replace('\uf06c', '')
    s = s.replace('。，', '，')
    s = s.replace('，。', '，')
    s = s.replace('，nan', '')
    s = s.replace('&lt；', '')
    
    ## remove zh number bullet
    ch_number = "一二三四五六七八九十"
    p = '[{}\d]、'.format(ch_number)
    s = re.sub(p, '', s)
    
    ## remove number bullet
    p = '((?<!\d)\d+\.(?!\d)|★)'
    s = re.sub(p, '',s)
    
    s = s.strip()
    try:
        ## remove trailing number
        while s[-1] in string.digits:
            s = s[:-1]
    except:
        return ''
    s = s.strip()
    
    s = s.strip()
    try:
        ## remove trailing non stop punctuation
        while s[-1] in hanzi.non_stops:
            s = s[:-1]
    except:
        return ''
    s = s.strip()
    
    return s

# Post process the summary

## Load summary dict

### Utils

In [32]:
_dict = {
    "# The content is removed due to confidential concerns."
}

In [33]:
default_talents = ["# The content is removed due to confidential concerns."]

def generate_docx(doc, sum_sent, talents=default_talents, debug=False):    
    no_duplicate_sents = []
    
    for talent in talents:
        sents = sum_sent[talent]
        
        if len(sents) == 0:
            continue
        
        doc.add_heading(_dict[talent], level=2)

        sum_sents_buffer = []
        
        for sent in sents:
            if sent not in no_duplicate_sents:
                no_duplicate_sents.append(sent)
                sum_sents_buffer.append(sent)
        
        for sent in sum_sents_buffer:
#             doc.add_paragraph(sent)
            doc.add_paragraph(sent, style='List Bullet')
            
#     for talent in default_talents:
#         if talent not in talents:
#             doc.add_heading(talent, level=2)
#             doc.add_paragraph("無")
            
    return doc

In [34]:
debug_tuple = [
    "# The content is removed due to confidential concerns."
]

In [35]:
## STEP 0: load the summary data
if COMMENT_AUGMENTATION:
    significance_summary_docx_dir = os.path.join(
        P.FP_SIGNIFICANCE_SUMMARY_DIR, TRAIN_OR_ALL, 'w', SIGNIFICANCE_MODEL_SAVE_DIR_NAME, "epoch_{}_max_sent_{}".format(EPOCH, MAX_SENT)
    )
    summary_docx_dir = os.path.join(
        P.FP_SUMMARY_GENERATION_DIR, TRAIN_OR_ALL, 'w', SIGNIFICANCE_MODEL_SAVE_DIR_NAME, "epoch_{}_max_sent_{}".format(EPOCH, MAX_SENT)
    )
else:
    significance_summary_docx_dir = os.path.join(
        P.FP_SIGNIFICANCE_SUMMARY_DIR, TRAIN_OR_ALL, 'wo', SIGNIFICANCE_MODEL_SAVE_DIR_NAME, "epoch_{}_max_sent_{}".format(EPOCH, MAX_SENT)
    )
    summary_docx_dir = os.path.join(
        P.FP_SUMMARY_GENERATION_DIR, TRAIN_OR_ALL, 'wo', SIGNIFICANCE_MODEL_SAVE_DIR_NAME, "epoch_{}_max_sent_{}".format(EPOCH, MAX_SENT)
    )
    
uniqueness_summary_docx_dir = os.path.join(
    P.FP_UNIQUENESS_SUMMARY_DIR, TRAIN_OR_ALL
)

In [36]:
if not os.path.exists(summary_docx_dir):
    os.makedirs(summary_docx_dir)

In [None]:
significance_summary_docx_dir

In [None]:
uniqueness_summary_docx_dir

In [None]:
summary_docx_dir

In [40]:
## load significance summary data
fn_significance_summary_dict = os.path.join(significance_summary_docx_dir, "summary_dict.pkl")
with open(fn_significance_summary_dict, 'rb') as f:
    significance_summary_dict = pickle.load(f)
    
## load uniqueness summary data
fn_uniqueness_summary_dict = os.path.join(uniqueness_summary_docx_dir, "uniqueness_summary_dict.pkl")
with open(fn_uniqueness_summary_dict, 'rb') as f:
    uniqueness_summary_dict = pickle.load(f)

In [None]:
uniqueness_summary_docx_dir

In [None]:
len(significance_summary_dict), len(uniqueness_summary_dict)

In [43]:
debug = False

## Load applicant info

In [55]:
df_comments = D.read_df_comments()
df_applicants = D.read_df_applicants()
test_df = pd.read_csv("112_F_experiment.csv")

In [56]:
df_applicants = pd.concat([df_applicants, test_df])

In [57]:
applicant_group_info = {}

if VAL_OR_TEST == 'test':
    for _, row in df_applicants.iterrows():
        _year = row['year']
        _id = row['id']
        _name = row['name']
        _group = row['group']
        
        applicant_group_info[(_year, _id, _name)] = _group

In [58]:
debug_tuple = [(112, 23010512, '莊詠宸')]

In [None]:
pp_summary_dict = {}
pp_summary_no_tag_dict = {}

i = 0

for info, sig_summary_info in tqdm(significance_summary_dict.items()):
    if info not in uniqueness_summary_dict.keys():
        continue
    
    ## STEP 1: Post process summary
    _year = info[0]
    _id = info[1]
    _name = info[2]
    _group = applicant_group_info[info]

    if debug:
        if info not in debug_tuple:
            continue

    i += 1
            
    uni_summary = uniqueness_summary_dict[info]
            
    sig_summary = sig_summary_info['summary']
    title_weight = sig_summary_info['title_weight']

    if debug:
        print(info)
        print("significance summary before post process")
        print(sig_summary)
        print("="*10)

    pp_summary = defaultdict(list)
    pp_summary_no_tag = defaultdict(list)
    buf_summary = copy.deepcopy(sig_summary)
    
    ## append uniqueness summary
    cnt = 0
    while cnt < UNI_MAX_SENT and len(uni_summary) > 0:
        uni_summary = [sent for sent in uni_summary if not PP.is_empty_sent(sent)]
        
        if len(uni_summary) == 0:
            break
        
        uni_summary_sent_embed = sbert_model.encode(uni_summary, show_progress_bar=False)
        
        buf_summary_sent = list(chain.from_iterable(buf_summary.values()))
        buf_summary_sent = [sent for sent in buf_summary_sent if not PP.is_empty_sent(sent)]
        
        if len(buf_summary_sent) == 0:
            buf_summary['獨特表現'].append(uni_summary[0])
            sig_summary['獨特表現'].append(uni_summary[0])
            cnt += 1
            uni_summary[0] = ''
            uni_summary = [sent for sent in uni_summary if not PP.is_empty_sent(sent)]
            continue
        
        buf_summary_sent_embed = sbert_model.encode(buf_summary_sent, show_progress_bar=False)
        
#         print(uni_summary)
        
        sim_mat = np.array(cos_sim(uni_summary_sent_embed, buf_summary_sent_embed))
        uni_sent_disimilarity = 1 - np.mean(sim_mat, axis=-1)
        
        idx = np.argmax(uni_sent_disimilarity)
        
        if debug:
            print("disimilarity", uni_sent_disimilarity[idx])
            print("sent", uni_summary[idx])
            print("max sim", np.max(sim_mat[idx]))
        
        ## append unique sentence that is most disimilar to significance summary
        if np.max(sim_mat[idx]) < 0.94:
            buf_summary['獨特表現'].append(uni_summary[idx])
            sig_summary['獨特表現'].append(uni_summary[idx])
            cnt += 1
        uni_summary[idx] = ''
        uni_summary = [sent for sent in uni_summary if not PP.is_empty_sent(sent)]
    
    if debug:
        IO.print_dividing_line()
    
    for talent, sum_sents in sig_summary.items():
        for sum_sent in sum_sents:
            _bidx = buf_summary[talent].index(sum_sent)
            rl_infos = rl_info_dict[(_year, _id)]
            sent_evidence = sent_evidence_score_dict[info]

            tag = ""

            if sum_sent in rl_infos.keys():
                ## 1. add citations (only for rl)
                rl_info = rl_infos[sum_sent]

                if info != '':
                    tag = rl_info
            else:
                ## 2. add verified mark (only for not rl)
                evidence = sent_evidence[sum_sent]

                if evidence > EVIDENCE_SCORE_THRESHOLD:
                    tag = "已驗證"

            ## trim sentence that is too long
            ## get the length of the sentence
            sum_sent_len = get_sent_len(sum_sent)

            if sum_sent_len > MAX_SENT_LEN:                
                if debug:
                    print('sum_sent_to_trim:', sum_sent)

                debug_info = all_chunk_debug_info_buffer[info]

                ext_summary = list(chain.from_iterable(buf_summary.values()))
                ext_summary = [sent for sent in ext_summary if sent != sum_sent]
                ext_summary = '。'.join(ext_summary)

                if debug:
                    print('ext_summary:', ext_summary)
                sum_sent = long_sentence_post_process(sum_sent, debug_info, ext_summary, debug)

                if debug:
                    print('sum_sent_after_trim:', sum_sent, "len:", get_sent_len(sum_sent))
                    print('-'*10)

            sum_sent = sentence_post_process(sum_sent)
            ## replace sum sent in original summary
            buf_summary[talent][_bidx] = sum_sent

            ## [TODO] process sentence with unclosed parenthesis (remove unreasonable sentence)
            if sum_sent == '':
                continue

            pp_summary_no_tag[talent].append(sum_sent)
                
#             if tag:
#                 sum_sent = "{}（{}）".format(sum_sent, tag)

            pp_summary[talent].append(sum_sent)

    talent_list = sorted(title_weight.items(), key=lambda i: -i[1])
    talent_list = [t[0] for t in talent_list]
    talent_list.append('獨特表現')
    
    if debug:
        print('='*10)
        print("summary after post process")
        print(pp_summary)
        IO.print_dividing_line()

    pp_summary_dict[info] = pp_summary
    pp_summary_no_tag_dict[info] = pp_summary_no_tag
    
#     if not debug:
#         ## STEP 2: Write post processed summary to docx file
#         doc = Document()
#         doc = generate_docx(doc, pp_summary, talent_list)

#         fn = "{}.docx".format("_".join(map(str, info)))
#         if debug:
#             print(fn)
            
#         if VAL_OR_TEST == 'all':
#             doc.save(os.path.join(summary_docx_dir, fn))
#         elif VAL_OR_TEST == 'test':
#             _group_dir = os.path.join(summary_docx_dir, _group, 'docx')
            
#             if not os.path.exists(_group_dir):
#                 os.mkdir(_group_dir)
            
#             doc.save(os.path.join(_group_dir, fn))
            
            
i

In [None]:
pp_summary_no_tag_dict

### Add our summary to test_df

In [67]:
our_summary_list = []

for _, row in test_df.iterrows():
    _year = row['year']
    _id = row['id']
    _name = row['name']
    
    summary_dict = pp_summary_no_tag_dict[(_year, _id, _name)]
    summary = list(chain.from_iterable(summary_dict.values()))
    summary = ''.join(summary)
    
    our_summary_list.append(summary)

In [69]:
test_df['our summary'] = our_summary_list

In [None]:
test_df.head()

## Quantitative Experiments

In [71]:
df_test_comments = pd.read_csv('112_comments.csv')

In [None]:
df_test_comments.head()

In [None]:
df_test_comments.iloc[0][4:].to_list()

### Statistics for comments

In [None]:
comments_list = []

for _, row in test_df.iterrows():
    _year = row['year']
    _id = row['id']
    _name = row['name']
    
    ## query comments
    row = df_test_comments.query("`year` == @_year and `id` == @_id")
    
    comments = []
    
    if len(row) > 0:
        for i in range(4, 4+5):
            comments.append(row.iat[0, i])
        comments = [c for c in comments if not PP.is_empty_sent(c)]
        
    print(comments)
    print(_year, _id, _name)
    print(len(comments))
    
    comments_list.append(comments)

## Avg len

In [168]:
def calculate_word_count(ss):
    if not ss:
        return 0
    
    ## check the language of the document
    zh_char_count = sum([1 for ch in ss if PP.is_zh_character(ch)])
    zh_char_rate = zh_char_count / len(ss)
    
    if zh_char_rate < 0.1: ## english document preprocess
        tokens = len(ss.split(' '))
    else: ## chinese document preprocess
        tokens = len(ss)

    return tokens

In [177]:
test_comments = list(chain.from_iterable(test_df['comments'].to_list()))

In [None]:
pd.Series(list(map(calculate_word_count, test_comments))).describe()

## Avg comment per application

In [113]:
test_df['comments'] = comments_list

In [None]:
test_df['comments'].apply(len).describe()

### Prepare data pair

In [None]:
test_df.columns

In [138]:
row_data = []
our_summary_test = []
chatgpt_summary_test = []
comment_test = []

for _, row in test_df.iterrows():
    ours = row['chat gpt summary']
    chatgpt = row['our summary']
    comments = row['comments']
    
    for c in comments:
        our_summary_test.append(ours)
        chatgpt_summary_test.append(chatgpt)
        comment_test.append(c)
        row_data.append({
            'chatgpt': chatgpt,
            'ours': ours,
            'comment': c
        })

In [None]:
df_test_pair = pd.DataFrame(row_data)
df_test_pair.shape

In [None]:
len(our_summary_test), len(chatgpt_summary_test), len(comment_test)

## Cosine Similarity

In [182]:
from sentence_transformers.util import cos_sim

In [183]:
our_summary_test_embed = sbert_model.encode(our_summary_test)
chatgpt_summary_test_embed = sbert_model.encode(chatgpt_summary_test)
comment_test_embed = sbert_model.encode(comment_test)

In [193]:
## ours
cos_sim(our_summary_test_embed, comment_test_embed).diagonal().

tensor([0.9260, 0.7706, 0.4829, 0.8467, 0.7977, 0.7653, 0.7873, 0.8894, 0.8538,
        0.6683, 0.7522, 0.9457, 0.8995, 0.8650, 0.8305, 0.8306, 0.9190, 0.8940,
        0.8617, 0.9130, 0.6883, 0.8256, 0.9646, 0.8966, 0.9221, 0.8510, 0.8722,
        0.8225, 0.7977, 0.8136, 0.9149, 0.8047, 0.7436, 0.9153, 0.9401, 0.8621,
        0.8346, 0.9183, 0.8446, 0.8653, 0.8705, 0.7727, 0.8741, 0.8380, 0.8270,
        0.6944, 0.9113, 0.8405, 0.8368, 0.9052, 0.5723, 0.5894, 0.8876, 0.8496,
        0.8417, 0.7801, 0.8857, 0.8595, 0.9413, 0.9128, 0.8731, 0.9409, 0.7675,
        0.7889, 0.8755, 0.8966, 0.9276, 0.8308, 0.8685, 0.9420, 0.9324, 0.8664,
        0.8071, 0.9354, 0.9565, 0.9002, 0.8960, 0.7641])

### BERTScore

In [121]:
from bert_score import score

In [122]:
def calculate_bert_score(cands, refs, rescale=False, verbose=False):
    return score(
        cands,
        refs,
        lang="zh",
    #     model_type=MODEL_TYPE,
    #     num_layers=LAYER,
        verbose=verbose,
        device=0,
        batch_size=64,
    #     idf=False,
        rescale_with_baseline=rescale
    )

In [123]:
## ours
ours_bs_P, ours_bs_R, ours_bs_F1 = \
    calculate_bert_score(our_summary_test, comment_test, rescale=False)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [126]:
print("Ours")
    
print("p: {:4f}".format(torch.mean(ours_bs_P)))
print("r: {:4f}".format(torch.mean(ours_bs_R)))
print("f: {:4f}".format(torch.mean(ours_bs_F1)))

Ours
p: 0.526666
r: 0.628861
f: 0.572000


In [124]:
## chatgpt
chatgpt_bs_P, chatgpt_bs_R, chatgpt_bs_F1 = \
    calculate_bert_score(chatgpt_summary_test, comment_test, rescale=False)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [127]:
print("ChatGPT")
    
print("p: {:4f}".format(torch.mean(chatgpt_bs_P)))
print("r: {:4f}".format(torch.mean(chatgpt_bs_R)))
print("f: {:4f}".format(torch.mean(chatgpt_bs_F1)))

ChatGPT
p: 0.517770
r: 0.632340
f: 0.568303


### ROUGE-r

In [131]:
from transformers import BertTokenizer

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [133]:
def get_tokenized_sentences(batch_sent, return_sent=False):
    batch_tokens = tokenizer(
        batch_sent, 
        return_tensors=None, 
        return_token_type_ids=False, 
        return_attention_mask=False,
        add_special_tokens=False
    )
    
    batch_tokens = batch_tokens['input_ids']
    tokenized_sents_id = [' '.join(str(_id) for _id in tokens) for tokens in batch_tokens]
    tokenized_sents = tokenizer.batch_decode(batch_tokens)
    
    if return_sent:
        return tokenized_sents_id, tokenized_sents
    else:
        return tokenized_sents_id

In [134]:
from rouge_score import rouge_scorer

In [135]:
rouge_metrics = ['rouge1', 'rouge2', 'rougeL']
cls_metrics = ['precision', 'recall', 'f1']

scorer = rouge_scorer.RougeScorer(rouge_metrics)

In [136]:
def calculate_rouge_score(cand, ref):
    ## preprocess
    ref = get_tokenized_sentences([ref])[0]
    cand = get_tokenized_sentences([cand])[0]
        
    return scorer.score(ref, cand)  ## reference, candidate

### Prepare data

In [None]:
## ours

df_rouge_ours = pd.json_normalize(df_test_pair.progress_apply(
    lambda row: calculate_rouge_score(row['ours'], row['comment']), axis=1
))

In [None]:
## chatgpt

df_rouge_chatgpt = pd.json_normalize(df_test_pair.progress_apply(
    lambda row: calculate_rouge_score(row['chatgpt'], row['comment']), axis=1
))

In [164]:
# rouge_metrics: 'rouge1', 'rouge2', 'rougeL'
# cls_metrics: 'precision', 'recall', 'f1'
score_dict = {}

for rm in rouge_metrics:
    rouge_score_dict = {}
    r_scores = df_rouge_ours[rm]
    
    for i, cm in enumerate(cls_metrics):
        scores = r_scores.apply(lambda t: t[i])
        score = scores.mean()
        rouge_score_dict[cm] = score
        
    score_dict[rm] = rouge_score_dict

print("Ours")
pd.DataFrame(score_dict)

Ours


Unnamed: 0,rouge1,rouge2,rougeL
precision,0.087311,0.028224,0.055257
recall,0.460914,0.139751,0.319635
f1,0.13557,0.042953,0.086609


In [165]:
# rouge_metrics: 'rouge1', 'rouge2', 'rougeL'
# cls_metrics: 'precision', 'recall', 'f1'
score_dict = {}

for rm in rouge_metrics:
    rouge_score_dict = {}
    r_scores = df_rouge_chatgpt[rm]
    
    for i, cm in enumerate(cls_metrics):
        scores = r_scores.apply(lambda t: t[i])
        score = scores.mean()
        rouge_score_dict[cm] = score
        
    score_dict[rm] = rouge_score_dict

print("ChatGPT")
pd.DataFrame(score_dict)

ChatGPT


Unnamed: 0,rouge1,rouge2,rougeL
precision,0.08179,0.025533,0.050778
recall,0.459849,0.133773,0.311973
f1,0.126288,0.038353,0.07907
