# README

### Purpose of this notebook
- Predict the topic of each application sentence with respect to the comment clusters using the K-Nearest Neighbors (KNN) algorithm.
- Form the summary based on the above prediction.

### Steps
1. Get the reduced embeddings and cluster labels of the comment sentences.
2. Apply the KNN algorithm to predict the label and confidence of each application sentence.
3. Generate summary according to the results from step 2.

In [None]:
import pandas as pd
import numpy as np
from itertools import chain
import os

from tqdm import tqdm
tqdm.pandas(desc="progress: ")

from importlib import reload

# Utility variable
import sys, getopt
sys.path.insert(0, '../..')

# var
import var.path as P

# utils
import utils.data as D
import utils.preprocess as PP
import utils.io as IO

## Hyper-parameters

## Test tuple setup

In [None]:
# Load the applicant / application tables, then read the experiment sheet and
# tag every one of its rows as 'true_test'.
df_applicants = D.read_df_applicants()
df_applications = D.read_df_applications()
test_df = pd.read_csv("112_F_experiment.csv").assign(train_or_test='true_test')

In [None]:
# Append the experiment applicants to the applicant table.
# NOTE: this rebinds df_applicants to "itself + test_df", so running the cell a
# second time appends test_df again.
df_applicants = pd.concat([df_applicants, test_df])
df_applicants

In [None]:
# df_applicants = test_df

In [None]:
# Attach each applicant's name and split tag to the application rows. The left
# join keeps every application, including those without an applicant record.
df_applications = df_applications.merge(
    df_applicants[['year', 'id', 'name', 'train_or_test']],
    on=['year', 'id'],
    how='left',
)

In [None]:
df_applications.shape

In [None]:
df_applications.head()

In [None]:
df_applications.shape

In [None]:
df_applications['train_or_test'].value_counts()

### Number of pages

In [None]:
df_applications[df_applications['train_or_test'] == 'train'].num_pages.describe()

In [None]:
df_applications[df_applications['train_or_test'] == 'test'].num_pages.describe()

In [None]:
df_applications[df_applications['train_or_test'] == 'true_test'].num_pages.describe()

### Self-statement length

In [None]:
def calculate_word_count(ss):
    """Return a rough length measure for one document.

    Args:
        ss: A string, or a list whose items are flattened one level with
            ``chain.from_iterable`` and joined into a single string. ``None``,
            NaN and empty values count as an empty document.

    Returns:
        ``0`` for an empty or missing document. Otherwise, when fewer than 10%
        of the characters satisfy ``PP.is_zh_character``, the number of pieces
        obtained by splitting on a single space; in every other case the
        number of characters.
    """
    ## a missing pandas cell is NaN, which is truthy, so test for it explicitly
    if ss is None or (isinstance(ss, float) and ss != ss):
        return 0
    if not ss:
        return 0

    if isinstance(ss, list):
        ss = ''.join(chain.from_iterable(ss))

    ## a non-empty list can still flatten to '' (e.g. [[]] or ['']); without
    ## this guard the rate below would divide by zero
    if not ss:
        return 0

    ## check the language of the document
    zh_char_count = sum(1 for ch in ss if PP.is_zh_character(ch))
    zh_char_rate = zh_char_count / len(ss)

    if zh_char_rate < 0.1:
        ## english document: count space-separated pieces
        return len(ss.split(' '))

    ## chinese document: count characters
    return len(ss)

In [None]:
df_applications['ss_len'] = df_applications['self_statement'].progress_apply(calculate_word_count)

In [None]:
df_applications[df_applications['train_or_test'] == 'train'].ss_len.describe()

In [None]:
df_applications[df_applications['train_or_test'] == 'test'].ss_len.describe()

In [None]:
df_applications[df_applications['train_or_test'] == 'true_test'].ss_len.describe()

### Recommendation letters length

In [None]:
df_recommendation_letters = D.read_df_recommendation_letters()

In [None]:
def get_chunks_and_sents_and_refs_from_recommendation_letter(row):
    """Collect every recommendation-letter sentence for one application row.

    Despite its name this version returns sentences only (no chunks, no refs).

    NOTE(review): a function with the same name but a ``(_year, _id)``
    signature is defined further down in this notebook and replaces this one
    once its cell has run; re-running the ``rl_sents`` cell afterwards would
    call the wrong version.

    Args:
        row: A mapping / Series providing ``'year'`` and ``'id'``.

    Returns:
        A flat list with the items of ``all_paragraph_sent`` of every matching
        row of ``df_recommendation_letters``; ``[]`` when nothing matches or
        the column does not exist.
    """
    rows = df_recommendation_letters.query(
        '`year` == {} and `id` == {}'.format(row['year'], row['id'])
    )

    try:
        rls_sents = rows['all_paragraph_sent'].to_list()
    except KeyError:
        ## keep the original best-effort behaviour when the column is absent
        return []

    sents = []
    for rl_sents in rls_sents:
        ## a letter without extracted sentences shows up as None / NaN
        if rl_sents is None or isinstance(rl_sents, float):
            continue
        sents.extend(rl_sents)

    return sents

In [None]:
df_applications['rl_sents'] = df_applications.apply(get_chunks_and_sents_and_refs_from_recommendation_letter, axis=1)

In [None]:
df_applications['rl_len'] = df_applications['rl_sents'].apply(calculate_word_count)

In [None]:
df_applications[df_applications['train_or_test'] == 'train'].rl_len.describe()

In [None]:
df_applications[df_applications['train_or_test'] == 'test'].rl_len.describe()

In [None]:
df_applications[df_applications['train_or_test'] == 'true_test'].rl_len.describe()

### All document length

In [None]:
df_applications['all_len'] = df_applications['application_pages'].progress_apply(calculate_word_count)

In [None]:
df_applications[df_applications['train_or_test'] == 'train'].all_len.describe()

In [None]:
df_applications[df_applications['train_or_test'] == 'test'].all_len.describe()

In [None]:
df_applications[df_applications['train_or_test'] == 'true_test'].all_len.describe()

## Comments

In [None]:
# df_applications = pd.merge(
#     df_applications, df_applicants[['year', 'id', 'name']], how='right', on=['year', 'id']
# )

In [None]:
df_applications

In [None]:
len(df_applications)

In [None]:
# Applications without a matching applicant record have no name; mark them '?'.
# Bracket indexing is used because attribute-style assignment only updates a
# column when it already exists, and otherwise creates a plain attribute.
df_applications['name'] = df_applications['name'].fillna('?')

In [None]:
# One (year, id, name) tuple per application row, in row order. These values
# later become the keys of the result buffers that are pickled at the end.
tuples = df_applications.apply(lambda row: (row['year'], row['id'], row['name']), axis=1).to_list()

In [None]:
# Re-shape every (year, id, name) tuple into a keyed record.
tuples = [
    {'year': info[0], 'id': info[1], 'name': info[2]}
    for info in tuples
]

In [None]:
len(tuples)

In [None]:
tuples = tuples[START_IDX:END_IDX]

In [None]:
len(tuples)

In [None]:
def dict_info_to_tuple_info(dict_info):
    """Turn an applicant record into its ``(year, id, name)`` tuple.

    Args:
        dict_info: Mapping with the keys ``'year'``, ``'id'`` and ``'name'``.

    Returns:
        The three values as a tuple, in that order.
    """
    return tuple(dict_info[field] for field in ('year', 'id', 'name'))

In [None]:
# Per-document-type weights of the chunk importance score computed in
# calculate_candidate_sents_score. The importance is the weighted sum of the
# component scores divided by the sum of the weights; 'claim' and 'future_plan'
# enter that sum as (1 - score). Apart from 'knn_conf', a component whose weight
# is not positive is not computed at all.

# Weights applied to data-sheet entries.
DATA_SHEET_SUMMARY_WEIGHT = {
    'knn_conf': 0,
    'topic_match': 1,
    'claim': 0,
    'future_plan': 0,
    'evidence': 2,
    'uniqueness': 0,
}

# Weights applied to the self-statement.
SELF_STATEMENT_SUMMARY_WEIGHT = {
    'knn_conf': 0,
    'topic_match': 1,
    'claim': 2,
    'future_plan': 0.25,
    'evidence': 2,
    'uniqueness': 0,
}

# Weights applied to recommendation letters.
RECOMMENDATION_LETTER_SUMMARY_WEIGHT = {
    'knn_conf': 0,
    'topic_match': 1,
    'claim': 2,
    'future_plan': 0,
    'evidence': 0,
    'uniqueness': 0,
}

In [None]:
def defaultdict_init_defaultdict_init_by_int():
    """Return a new ``defaultdict(int)``.

    Used as the default factory of an outer ``defaultdict`` so that each
    missing outer key gets its own integer-counting inner mapping.

    NOTE(review): presumably a named function rather than a lambda so that the
    nested structures can be pickled at the end of the notebook -- confirm.
    """
    return defaultdict(int)

def defaultdict_init_defaultdict_init_by_float():
    """Return a new ``defaultdict(float)``.

    Used as the default factory of an outer ``defaultdict`` so that each
    missing outer key gets its own float-accumulating inner mapping.
    """
    return defaultdict(float)

In [None]:
# Results handed out by calculate_candidate_sents_score when a document has no
# chunks or no sentences. The keys mirror the dicts that function builds for a
# non-empty document, plus 'refs'.
# These are shared module-level objects: copy them before writing into them.
empty_candidate_sents_info = {
    "sents": [],
    "sents_avg_importance_dict": {},
    "sents_topic_importance_dict": {},
    "sents_topic_id_dict": {},
    "topic_sent_dict": {},
    "refs": {},
}

# Chunk-level counterpart of the template above (one entry per debug field).
empty_chunk_debug_info = {
    "chunks": [],
    "predicted_topics": [],
    "predicted_knn_confs": [],
    "predicted_neighbors_sc_idx": [],
    "knn_confidence": [],
    "topic_match_score": [],
    "claim_score": [],
    "future_plan_score": [],
    "evidence_score": [],
    "uniqueness_score": [],
    "importance": [],
    "refs": {},
}

## Get the reduced embeddings and labels from comment sentences

### Load BERTopic Model

In [None]:
topic_doc_tokenizer = BT.topic_doc_tokenizer
custom_update_topics = BT.custom_update_topics

In [None]:
topic_model = BERTopic.load(
    os.path.join(P.FP_COMMENT_CLUSTERING_MODEL_DIR, BERTOPIC_MODEL_NAME),
)

In [None]:
sbert_model = topic_model.embedding_model.embedding_model

### Fetch class label and representatives for each topic

In [None]:
topic_rep_dict = topic_model.get_representative_docs()

In [None]:
# Replace the representative documents of topic -1 (the outlier cluster) with
# placeholder strings. calculate_candidate_sents_score looks up
# topic_rep_dict[tid] for every predicted topic id.
topic_rep_dict[-1] = ['0', '0', '0']

In [None]:
topic_rep_dict

In [None]:
topic_class_label = {}

In [None]:
topic_info = topic_model.get_topic_info()

In [None]:
def extract_topic_class_label(topic_rep):
    """Parse one topic name and record its label words.

    The name is split on ``'_'``: the first piece is the integer topic id and
    the remaining pieces are stored under that id in the module-level
    ``topic_class_label`` dict. Nothing is returned.

    Args:
        topic_rep: A value of the ``'Name'`` column of the topic info table.
    """
    tid_text, *label_words = topic_rep.split('_')
    topic_class_label[int(tid_text)] = label_words

In [None]:
_ = topic_info['Name'].apply(extract_topic_class_label)

In [None]:
topic_class_label

### Load comment sentences

In [None]:
df_split_comments = D.read_df_split_comments_no_duplicate(TRAIN_OR_ALL)
split_comments = D.read_split_comments_no_duplicate(TRAIN_OR_ALL)
df_tokenization_database = df_split_comments

In [None]:
len(df_split_comments)

In [None]:
len(split_comments)

### Get reduced embeddings

In [None]:
## Reduced coordinates stored on the fitted 'umap' step of the topic model.
## umap_model is indexed by step name here, so it is presumably a pipeline-like
## wrapper with a 'umap' reducer and a 'norm' post-processing step -- TODO confirm.
reduced_split_comments_embeds = topic_model.umap_model['umap'].embedding_
reduced_split_comments_embeds = topic_model.umap_model['norm'].transform(reduced_split_comments_embeds)
## displayed as a sanity check; these rows are fitted together with
## topic_labels in get_topic_prediction, so both must have the same length
reduced_split_comments_embeds.shape

### Get the topic labels

In [None]:
import hdbscan

In [None]:
# %%time
## Labels stored on the fitted HDBSCAN model; -1 is treated as the outlier
## label elsewhere in this notebook.
topic_labels = topic_model.hdbscan_model.labels_
## _map_predictions is a private method of the topic model.
## NOTE(review): presumably it remaps the raw cluster ids to the model's final
## topic ids -- confirm against the installed BERTopic version.
topic_labels = topic_model._map_predictions(topic_labels)

### Get the sentiment of the comment

In [None]:
# from transformers import BertForSequenceClassification
# from transformers import BertTokenizer

In [None]:
# from torch.utils.data import Dataset
# from torch.utils.data import DataLoader
# from torch import Tensor

In [None]:
# sentiment_analysis_model_name = 'IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment'

# sentiment_analysis_tokenizer = BertTokenizer.from_pretrained(sentiment_analysis_model_name)
# sentiment_analysis_model = BertForSequenceClassification.from_pretrained(sentiment_analysis_model_name).to(device)

In [None]:
# def sentiment_analysis_inference(text):
#     dataset = Tor.BatchSentenceDataset(text)
#     dataloader = DataLoader(dataset, batch_size=16, shuffle=False)
    
#     prob_batch = []
#     with torch.no_grad():
#         for batch in dataloader:
#             encoding = sentiment_analysis_tokenizer(batch, padding=True, return_tensors='pt', truncation='longest_first', max_length=510)

#             for key in encoding:
#                 if isinstance(encoding[key], Tensor):
#                     encoding[key] = encoding[key].to(device)

#             output = sentiment_analysis_model(**encoding)
#             postive_prob = torch.nn.functional.softmax(output.logits, dim=-1)[:, 1]
#             prob_batch.append(postive_prob)
            
#     postive_probs = torch.cat(prob_batch)
#      ## -1 represent negative, 1 represent neutral or positive
#     sentiment_label = [1 if p > 0.3 else -1 for p in postive_probs]
    
#     return sentiment_label

In [None]:
# import utils.torch as Tor

In [None]:
# # %%time
# split_comments_sentiment = sentiment_analysis_inference(split_comments)

In [None]:
# import pickle

# with open('./train_split_comments_sentiment.pkl', 'wb') as f:
#     pickle.dump(split_comments_sentiment, f)

In [None]:
import pickle

In [None]:
split_comments_sentiment = D.read_split_comments_sentiment(TRAIN_OR_ALL)

In [None]:
len(split_comments_sentiment)

# Application Inference
Predict the topic of each application sentence with respect to the comment clusters using the K-Nearest Neighbors (KNN) algorithm.

## Read Dataframe

### Read achievements

In [None]:
df_achievements = D.read_df_achievements()

In [None]:
df_achievements.head()

### Read self-statement

In [None]:
df_applications.tail()

### Read recommendation-letter 

In [None]:
df_recommendation_letters = D.read_df_recommendation_letters()

In [None]:
df_recommendation_letters.tail()

### Read Summaries

In [None]:
df_summary = D.read_df_summary()

In [None]:
df_summary.tail()

## Summary Generation Utility

In [None]:
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier, NearestCentroid

In [None]:
from itertools import chain

In [None]:
def get_evidences(_year, _id):
    """Return the evidence chunks of one applicant.

    Evidence currently means every chunk of every recommendation letter whose
    ``year`` / ``id`` match.

    Args:
        _year: Value compared against the ``year`` column.
        _id: Value compared against the ``id`` column.

    Returns:
        A flat list of the items of ``all_paragraph_chunk`` over all matching
        rows of ``df_recommendation_letters``; ``[]`` when nothing matches or
        the column does not exist.
    """
    rows = df_recommendation_letters.query('`year` == {} and `id` == {}'.format(_year, _id))

    try:
        rls_chunks = rows['all_paragraph_chunk'].to_list()
    except KeyError:
        ## keep the original best-effort behaviour when the column is absent
        return []

    ## a letter without extracted chunks shows up as None / NaN; skip it
    ## instead of failing while flattening
    return list(chain.from_iterable(
        rl_chunks for rl_chunks in rls_chunks
        if rl_chunks is not None and not isinstance(rl_chunks, float)
    ))

In [None]:
def get_recommendation_letter_uniqueness_ref_sents(_year, _id):
    """Return the reference sentences for the recommendation-letter uniqueness score.

    The result is the applicant's ``achievement`` values followed by the
    ``self_statement_sent`` list of the first matching row.

    NOTE(review): both columns are read from ``df_achievements``, while the
    self-statement sentences are read from ``df_applications`` elsewhere in
    this notebook. If ``df_achievements`` has no ``self_statement_sent``
    column, this function always returns ``[]`` -- confirm the intended source.

    Args:
        _year: Value compared against the ``year`` column.
        _id: Value compared against the ``id`` column.

    Returns:
        A list of reference sentences, or ``[]`` when a column is missing, no
        row matches, or the self-statement cell is not a list.
    """
    row = df_achievements.query('`year` == {} and `id` == {}'.format(_year, _id))

    try:
        ref_sents = row['achievement'].to_list() + row['self_statement_sent'].to_list()[0]
    except (KeyError, IndexError, TypeError):
        ## KeyError: column missing, IndexError: no matching row,
        ## TypeError: the self-statement cell cannot be appended to a list
        ref_sents = []

    return ref_sents

In [None]:
def get_topic_prediction(topic_model, chunks, n_neighbors, method="k", radius=0.02):
    """Predict a comment topic for every chunk with a neighbour classifier.

    The classifier is fitted on the reduced comment embeddings
    (``reduced_split_comments_embeds``) and their ``topic_labels``. It is
    refitted on every call.

    Args:
        topic_model: Topic model providing ``embedding_model.embed`` and
            ``umap_model.transform`` to embed and reduce ``chunks``.
        chunks: Texts to classify.
        n_neighbors: Number of neighbours (used by ``method="k"`` only).
        method: ``"k"`` for k-nearest neighbours, ``"r"`` for a fixed-radius
            neighbourhood.
        radius: Neighbourhood radius (used by ``method="r"`` only). Chunks
            without any neighbour inside the radius get the label ``-1``.

    Returns:
        ``(predicted_topics, predicted_confs, predicted_neighbors_idx)``: the
        predicted label per chunk, the class-probability rows, and the indices
        of the neighbouring comment sentences. With ``method="r"`` the number
        of neighbours varies per chunk, so the last item is an array of arrays.

    Raises:
        ValueError: If ``method`` is neither ``"k"`` nor ``"r"``.
    """
    if method == "k":
        neigh = KNeighborsClassifier(n_neighbors=n_neighbors)
    elif method == "r":
        ## KNeighborsClassifier accepts neither `radius` nor `outlier_label`;
        ## the radius variant lives in its own estimator class
        neigh = RadiusNeighborsClassifier(radius=radius, outlier_label=-1)
    else:
        raise ValueError('method must be "k" or "r", got {!r}'.format(method))
    neigh.fit(reduced_split_comments_embeds, topic_labels)

    ## get reduce chunk embeddings
    chunk_embeds = topic_model.embedding_model.embed(chunks)
    chunk_reduced_embeds = topic_model.umap_model.transform(chunk_embeds)

    ## predict topic and confidence
    predicted_topics = neigh.predict(chunk_reduced_embeds)
    predicted_confs = neigh.predict_proba(chunk_reduced_embeds)
    if method == "k":
        predicted_neighbors_idx = neigh.kneighbors(
            chunk_reduced_embeds, n_neighbors=n_neighbors, return_distance=False
        )
    else:
        ## the radius classifier has no kneighbors(); query by radius instead
        predicted_neighbors_idx = neigh.radius_neighbors(
            chunk_reduced_embeds, return_distance=False
        )

    return predicted_topics, predicted_confs, predicted_neighbors_idx

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def calculate_uniqueness_score(chunks, predicted_neighbors_sc_idx):
    """Score each chunk by its similarity to positive outlier comments nearby.

    For every chunk, the neighbouring comment sentences that carry topic label
    ``-1`` and sentiment ``1`` are selected; the score is the mean cosine
    similarity between the chunk embedding and the embeddings of those
    comments, or ``0`` when no such neighbour exists.

    Args:
        chunks: Texts to score.
        predicted_neighbors_sc_idx: Per chunk, the indices of its neighbouring
            comment sentences (indices into ``split_comments``).

    Returns:
        A plain Python list with one score per chunk.
    """
    chunk_vectors = sbert_model.encode(chunks, batch_size=128, show_progress_bar=False)

    def _score(chunk_vector, neighbor_indices):
        ## keep outlier comments only, and among those drop the negative ones
        positive_outliers = [
            split_comments[i]
            for i in neighbor_indices
            if topic_labels[i] == -1 and split_comments_sentiment[i] == 1
        ]
        if not positive_outliers:
            return 0

        ## average cosine similarity between the chunk and those comments
        outlier_vectors = sbert_model.encode(
            positive_outliers, batch_size=128, show_progress_bar=False
        )
        return cosine_similarity([chunk_vector], outlier_vectors).mean()

    return [
        _score(chunk_vector, neighbor_indices)
        for chunk_vector, neighbor_indices in zip(chunk_vectors, predicted_neighbors_sc_idx)
    ]

In [None]:
def calculate_candidate_sents_score(
    chunks,
    sents,
    evidences,
    rl_uniqueness_refs,
    topic_model,
    imp_weights,
    debug,
    n_neighbors=25,
):
    """Score every chunk of one document and aggregate the scores per sentence.

    Args:
        chunks: Text chunks of the document; each one is scored individually.
        sents: Sentences of the document. A chunk contributes to every
            sentence that contains it as a substring.
        evidences: Reference chunks forwarded to ``S.evidence_score``.
        rl_uniqueness_refs: Accepted for the planned recommendation-letter
            uniqueness score; currently unused.
        topic_model: Model forwarded to ``get_topic_prediction``.
        imp_weights: Mapping with the keys ``knn_conf``, ``topic_match``,
            ``claim``, ``future_plan``, ``evidence`` and ``uniqueness``. Apart
            from ``knn_conf``, a component whose weight is not positive is not
            computed and counts as all zeros.
        debug: When truthy, print every sentence with its importance and its
            per-topic chunk counts.
        n_neighbors: Number of neighbours used for the topic prediction.

    Returns:
        ``(candidate_sents_info, chunk_debug_info)``. For an empty document
        both are fresh empty containers with the keys of
        ``empty_candidate_sents_info`` / ``empty_chunk_debug_info``.
    """
    ## Deal with empty corpus
    if chunks is None or sents is None or len(chunks) == 0 or len(sents) == 0:
        ## hand out fresh containers: callers write into the returned dicts, so
        ## the shared module-level templates must never be returned themselves
        return (
            {key: type(value)() for key, value in empty_candidate_sents_info.items()},
            {key: type(value)() for key, value in empty_chunk_debug_info.items()},
        )

    ## Predict topics on text chunks
    predicted_topics, predicted_knn_confs, predicted_neighbors_sc_idx = get_topic_prediction(
        topic_model, chunks, n_neighbors
    )

    ## get topic reps for each chunks
    topic_class_reps = [topic_rep_dict[tid] for tid in predicted_topics]

    ## stand-in for every component whose weight is not positive
    skip_calculate_score = np.zeros(len(chunks))

    ## Calculate chunk candidates importance
    ## knn confidence
    knn_confidence = np.max(predicted_knn_confs, axis=1)
    ## topic matching score
    if imp_weights['topic_match'] > 0:
        topic_match_score = S.topic_match_score(chunks, topic_class_reps, batch_size=BATCH_SIZE)
    else:
        topic_match_score = skip_calculate_score
    ## claim score
    if imp_weights['claim'] > 0:
        claim_score = np.array([S.claim_score(c, chunks, batch_size=BATCH_SIZE) for c in chunks])
    else:
        claim_score = skip_calculate_score
    ## future plan score
    if imp_weights['future_plan'] > 0:
        future_plan_score = S.future_plan_score(chunks, batch_size=BATCH_SIZE)
    else:
        future_plan_score = skip_calculate_score
    ## evidence score
    if imp_weights['evidence'] > 0:
        evidence_score = S.evidence_score(chunks, evidences)
    else:
        evidence_score = skip_calculate_score
    ## [TODO] uniqueness score
    if imp_weights['uniqueness'] > 0:
        ## the helper returns a plain list; as an array it can be multiplied by
        ## a (possibly fractional) weight and summed element-wise below
        uniqueness_score = np.asarray(
            calculate_uniqueness_score(chunks, predicted_neighbors_sc_idx), dtype=float
        )
    else:
        uniqueness_score = skip_calculate_score

    ## [TODO] recommendation letter uniqueness score (would use rl_uniqueness_refs)

    ## importance: weighted sum normalised by the total weight; claim and
    ## future-plan scores enter as (1 - score)
    importance = (
        imp_weights['knn_conf'] * knn_confidence
        + imp_weights['topic_match'] * topic_match_score
        + imp_weights['claim'] * (1 - claim_score)
        + imp_weights['future_plan'] * (1 - future_plan_score)
        + imp_weights['evidence'] * evidence_score
        + imp_weights['uniqueness'] * uniqueness_score
    ) / sum(imp_weights.values())

    sents_avg_importance_dict = defaultdict(list)
    sents_topic_importance_dict = defaultdict(defaultdict_init_defaultdict_init_by_float)
    topic_sent_dict = defaultdict(set)
    sents_topic_id_dict = defaultdict(defaultdict_init_defaultdict_init_by_int)
    ## Aggregate sentence importance score over chunk importance score
    for chunk, imp, topic in zip(chunks, importance, predicted_topics):
        ## find the sentence containing the chunk
        for sent in sents:
            ## aggregate chunk importance
            ## [TODO] use df_comment to refine the result (because split chunk may not be exact match to sent)
            if chunk in sent:
                sents_avg_importance_dict[sent].append(imp)
                sents_topic_importance_dict[sent][topic] += imp
                topic_sent_dict[topic].add(sent)
                sents_topic_id_dict[sent][topic] += 1

    ## Calculate the importance score of the sentence
    sents_avg_importance_dict = {
        sent: np.mean(imp_list) for sent, imp_list in sents_avg_importance_dict.items()
    }

    if debug:
        for sent, imp in sents_avg_importance_dict.items():
            print(sent, imp, sents_topic_id_dict[sent])

    ## remember to update the following variable and function if the key values are changed
    ## empty_candidate_sents_info
    ## merge_candidate_sents_info
    candidate_sents_info = {
        "sents": sents,
        "sents_avg_importance_dict": sents_avg_importance_dict,
        "sents_topic_importance_dict": sents_topic_importance_dict,
        "sents_topic_id_dict": sents_topic_id_dict,
        "topic_sent_dict": topic_sent_dict,
    }

    ## remember to update the following variable and function if the key values are changed
    ## empty_chunk_debug_info
    ## merge_chunk_debug_info
    chunk_debug_info = {
        "chunks": chunks,
        "predicted_topics": predicted_topics,
        "predicted_knn_confs": predicted_knn_confs,
        "predicted_neighbors_sc_idx": predicted_neighbors_sc_idx,
        "knn_confidence": knn_confidence,
        "topic_match_score": topic_match_score,
        "claim_score": claim_score,
        "future_plan_score": future_plan_score,
        "evidence_score": evidence_score,
        "uniqueness_score": uniqueness_score,
        "importance": importance,
    }

    return candidate_sents_info, chunk_debug_info

In [None]:
def find_summary_candidate_pipe(info, get_chunks_and_sents_and_refs_func, imp_weights, doc_source, debug=False):
    """Run the candidate-scoring pipeline for one applicant and one document type.

    Args:
        info: Applicant record; ``info['year']`` and ``info['id']`` are used.
        get_chunks_and_sents_and_refs_func: Callable ``(_year, _id)`` returning
            ``(chunks, sents, ref_dict)`` for the document type to process.
        imp_weights: Importance weights forwarded to
            ``calculate_candidate_sents_score``.
        doc_source: Label of the document type; currently unused.
        debug: Forwarded to ``calculate_candidate_sents_score``.

    Returns:
        ``(candidate_sents_info, chunk_debug_info)`` as produced by
        ``calculate_candidate_sents_score``, each with ``ref_dict`` stored
        under the ``'refs'`` key.
    """
    ## get basic info
    _year = info['year']
    _id = info['id']

    ## get chunks and sents
    chunks, sents, ref_dict = get_chunks_and_sents_and_refs_func(_year, _id)
    ## get evidences, currently only return chunks from recommendation letter
    evidences = get_evidences(_year, _id)
    ## get recommendation letter uniqueness references,
    ## i.e. sentences from data sheet and self-statement
    rl_uniqueness_refs = get_recommendation_letter_uniqueness_ref_sents(_year, _id)
    ## [TODO] calculate importance score for each summary
    candidate_sents_info, chunk_debug_info = calculate_candidate_sents_score(
        chunks, sents, evidences, rl_uniqueness_refs, topic_model, imp_weights, debug
    )

    ## attach the refs on shallow copies: for an empty document the scorer may
    ## return shared template dicts, and writing into those would leak one
    ## applicant's refs into every other empty result
    candidate_sents_info = {**candidate_sents_info, 'refs': ref_dict}
    chunk_debug_info = {**chunk_debug_info, 'refs': ref_dict}

    return candidate_sents_info, chunk_debug_info

In [None]:
def merge_candidate_sents_info(old_info, new_info):
    """Combine the sentence-level results of two documents of one applicant.

    Args:
        old_info: Results accumulated so far; ``{}`` or an entry with an empty
            ``'sents'`` list means "nothing yet".
        new_info: Results of the document being added.

    Returns:
        ``new_info`` itself when ``old_info`` is empty, ``old_info`` itself
        when ``new_info`` has no sentences; otherwise a new dict in which the
        sentence lists are concatenated, the per-sentence dicts and ``refs``
        are merged (entries of ``new_info`` win on equal keys) and the
        per-topic sentence sets are united. Neither input is modified.

    NOTE(review): sentence lists are concatenated while ``refs`` are merged
    key-by-key. If ref keys are positions inside each document's own sentence
    list, they no longer line up with the merged list and may collide --
    confirm how consumers resolve them.
    """
    ## if old info is empty, return new info
    if not old_info or not old_info['sents']:
        return new_info

    ## if new info is empty, return old info
    if not new_info['sents']:
        return old_info

    info = {
        'sents': old_info['sents'] + new_info['sents'],
        'sents_avg_importance_dict': old_info['sents_avg_importance_dict'] | new_info['sents_avg_importance_dict'],
        'sents_topic_id_dict': old_info['sents_topic_id_dict'] | new_info['sents_topic_id_dict'],
        'sents_topic_importance_dict': old_info['sents_topic_importance_dict'] | new_info['sents_topic_importance_dict'],
        'refs': old_info['refs'] | new_info['refs'],
    }

    ## work on a copy so old_info stays untouched; .get() also covers a plain
    ## dict that has no entry for a topic seen only in new_info
    merged_topic_sents = old_info['topic_sent_dict'].copy()
    for topic, new_topic_sents in new_info['topic_sent_dict'].items():
        merged_topic_sents[topic] = set(merged_topic_sents.get(topic, ())) | set(new_topic_sents)
    info['topic_sent_dict'] = merged_topic_sents

    return info

In [None]:
def merge_chunk_debug_info(old_info, new_info):
    """Combine the chunk-level debug records of two documents of one applicant.

    Args:
        old_info: Records accumulated so far; ``{}`` or an entry with an empty
            ``'chunks'`` list means "nothing yet".
        new_info: Records of the document being added.

    Returns:
        ``new_info`` when ``old_info`` is empty and ``old_info`` when
        ``new_info`` has no chunks. Otherwise a new dict whose chunk lists are
        concatenated, whose score / prediction arrays are joined with
        ``np.concatenate`` and whose ``refs`` are merged (new entries win).
    """
    ## nothing accumulated yet -> the new records are the result
    if old_info == {} or old_info['chunks'] == []:
        return new_info

    ## nothing to add -> keep what we have
    if new_info['chunks'] == []:
        return old_info

    ## every per-chunk array, in the order the result dict lists them
    array_fields = (
        'predicted_topics',
        'predicted_knn_confs',
        'predicted_neighbors_sc_idx',
        'knn_confidence',
        'topic_match_score',
        'claim_score',
        'future_plan_score',
        'evidence_score',
        'uniqueness_score',
        'importance',
    )

    merged = {'chunks': old_info['chunks'] + new_info['chunks']}
    for field in array_fields:
        merged[field] = np.concatenate((old_info[field], new_info[field]))
    merged['refs'] = old_info['refs'] | new_info['refs']

    return merged

## Find summaries sentence candidates

### Find candidates from data sheet

In [None]:
candidate_sents_info_buffer = defaultdict(dict)
chunk_debug_info_buffer = defaultdict(dict)

In [None]:
def get_chunks_and_sents_and_refs_from_data_sheet(_year, _id):
    """Return chunks, sentences and refs built from an applicant's data sheet.

    Args:
        _year: Value compared against the ``year`` column of ``df_achievements``.
        _id: Value compared against the ``id`` column of ``df_achievements``.

    Returns:
        ``(chunks, sents, ref_dict)``: ``chunks`` are the ``achievement``
        values of the matching rows; ``sents`` pairs each achievement with its
        ``achievement_result`` as ``"achievement，result"``, or just the
        achievement when the result is missing; ``ref_dict`` is an empty
        ``defaultdict(str)``. Both lists are empty when a column is missing.
    """
    row = df_achievements.query('`year` == {} and `id` == {}'.format(_year, _id))

    try:
        achievements = row['achievement'].to_list()
        results = row['achievement_result'].to_list()
    except KeyError:
        ## keep the original best-effort behaviour when a column is absent
        achievements, results = None, None

    chunks = []
    sents = []
    if achievements is not None:
        chunks = achievements
        for achievement, result in zip(achievements, results):
            ## a missing result is None / NaN; formatting it would append a
            ## literal "nan" to the sentence
            if result is None or (isinstance(result, float) and result != result):
                sents.append("{}".format(achievement))
            else:
                sents.append("{}，{}".format(achievement, result))

    ref_dict = defaultdict(str)

    return chunks, sents, ref_dict

In [None]:
# %%time
## Data-sheet pass: score every applicant's achievement entries with
## DATA_SHEET_SUMMARY_WEIGHT and fold the result into the shared buffers.
## NOTE: results are merged into whatever the buffers already hold, so
## re-running this cell appends the same document's results again.

IO.print_dividing_line()
IO.print_dividing_line("Processing data sheet ...")

for dict_info in tqdm(tuples):
    ## (year, id, name) key under which this applicant's results are buffered
    idx = dict_info_to_tuple_info(dict_info)
    candidate_sents_info, chunk_debug_info = find_summary_candidate_pipe(
        dict_info, get_chunks_and_sents_and_refs_from_data_sheet, 
        DATA_SHEET_SUMMARY_WEIGHT, '個人資料表', debug=DEBUG
    )

    candidate_sents_info_buffer[idx] = merge_candidate_sents_info(candidate_sents_info_buffer[idx], candidate_sents_info)
    chunk_debug_info_buffer[idx] = merge_chunk_debug_info(chunk_debug_info_buffer[idx], chunk_debug_info)

#     IO.print_dividing_line()

### Find candidates from self-statement

In [None]:
def get_chunks_and_sents_and_refs_from_self_statement(_year, _id):
    """Return chunks, sentences and refs of an applicant's self-statement.

    Args:
        _year: Value compared against the ``year`` column of ``df_applications``.
        _id: Value compared against the ``id`` column of ``df_applications``.

    Returns:
        ``(chunks, sents, ref_dict)``: the ``self_statement_chunk`` and
        ``self_statement_sent`` cells of the first matching row, and an empty
        ``defaultdict(str)``. Both lists are empty when a column is missing or
        no row matches; a missing cell is returned as an empty list.
    """
    row = df_applications.query('`year` == {} and `id` == {}'.format(_year, _id))

    try:
        chunks = row['self_statement_chunk'].to_list()[0]
        sents = row['self_statement_sent'].to_list()[0]
    except (KeyError, IndexError):
        ## KeyError: column missing, IndexError: no matching row
        chunks = []
        sents = []

    ## a missing cell is None or NaN (a float); normalise both to an empty
    ## list so callers can safely take len()
    if chunks is None or isinstance(chunks, float):
        chunks = []
    if sents is None or isinstance(sents, float):
        sents = []

    ref_dict = defaultdict(str)

    return chunks, sents, ref_dict

In [None]:
# %%time
## Self-statement pass: score every applicant's self-statement with
## SELF_STATEMENT_SUMMARY_WEIGHT and fold the result into the shared buffers.
## NOTE: results are merged into whatever the buffers already hold, so
## re-running this cell appends the same document's results again.

IO.print_dividing_line()
IO.print_dividing_line("Processing self-statement ...")

for dict_info in tqdm(tuples):
    ## (year, id, name) key under which this applicant's results are buffered
    idx = dict_info_to_tuple_info(dict_info)
    candidate_sents_info, chunk_debug_info = find_summary_candidate_pipe(
        dict_info, get_chunks_and_sents_and_refs_from_self_statement, 
        SELF_STATEMENT_SUMMARY_WEIGHT, '自傳', debug=DEBUG
    )

    candidate_sents_info_buffer[idx] = merge_candidate_sents_info(candidate_sents_info_buffer[idx], candidate_sents_info)
    chunk_debug_info_buffer[idx] = merge_chunk_debug_info(chunk_debug_info_buffer[idx], chunk_debug_info)

#     IO.print_dividing_line()

### Find candidates from recommendation letter

In [None]:
def get_chunks_and_sents_and_refs_from_recommendation_letter(_year, _id):
    """Return chunks, sentences and refs of an applicant's recommendation letters.

    All letters of the applicant are concatenated into one document.

    NOTE(review): this definition replaces an earlier function of the same
    name that takes a single ``row`` argument.

    Args:
        _year: Value compared against the ``year`` column.
        _id: Value compared against the ``id`` column.

    Returns:
        ``(chunks, sents, ref_dict)``: flat lists over all matching letters,
        and a ``defaultdict(str)`` mapping ``("chunk", i)`` / ``("sent", i)``
        (``i`` is the position in the respective returned list) to the
        letter's ``info`` items joined with ``"，"``. Everything is empty when
        a column is missing or nothing matches.
    """
    rows = df_recommendation_letters.query('`year` == {} and `id` == {}'.format(_year, _id))

    try:
        rls_chunks = rows['all_paragraph_chunk'].to_list()
        rls_sents = rows['all_paragraph_sent'].to_list()
        rls_info = rows['info'].to_list()
    except KeyError:
        ## keep the original best-effort behaviour when a column is absent
        rls_chunks, rls_sents, rls_info = [], [], []

    def _is_missing(cell):
        ## a missing cell is None or NaN (a float)
        return cell is None or isinstance(cell, float)

    chunks = []
    sents = []
    ref_dict = defaultdict(str)
    ## concat several recommendation letter into one document
    ## however, it is possible to process the recommendation letter individually
    for rl_chunks, rl_sents, rl_info in zip(rls_chunks, rls_sents, rls_info):
        ## [TODO] replace with info string from the dataframe after improving preprocess
        ## joined once per letter; '' matches what ref_dict yields for unknown keys
        info_str = "" if _is_missing(rl_info) else "，".join(rl_info)

        for chunk in ([] if _is_missing(rl_chunks) else rl_chunks):
            ref_dict[("chunk", len(chunks))] = info_str
            chunks.append(chunk)
        for sent in ([] if _is_missing(rl_sents) else rl_sents):
            ref_dict[("sent", len(sents))] = info_str
            sents.append(sent)

    return chunks, sents, ref_dict

In [None]:
# %%time
## Recommendation-letter pass: score every applicant's letters with
## RECOMMENDATION_LETTER_SUMMARY_WEIGHT and fold the result into the shared buffers.
## NOTE: results are merged into whatever the buffers already hold, so
## re-running this cell appends the same document's results again.

IO.print_dividing_line()
IO.print_dividing_line("Processing recommendation letter ...")

for dict_info in tqdm(tuples):
    ## (year, id, name) key under which this applicant's results are buffered
    idx = dict_info_to_tuple_info(dict_info)
    candidate_sents_info, chunk_debug_info = find_summary_candidate_pipe(
        dict_info, get_chunks_and_sents_and_refs_from_recommendation_letter, 
        RECOMMENDATION_LETTER_SUMMARY_WEIGHT, '推薦信', debug=DEBUG
    )

    candidate_sents_info_buffer[idx] = merge_candidate_sents_info(candidate_sents_info_buffer[idx], candidate_sents_info)
    chunk_debug_info_buffer[idx] = merge_chunk_debug_info(chunk_debug_info_buffer[idx], chunk_debug_info)

#     IO.print_dividing_line()

## Generate pseudo summary

In [None]:
## [TODO] top-k sentence selection for each perspective

In [None]:
import pickle

In [None]:
## save all data
## NOTE(review): the file name does not encode START_IDX / END_IDX, so runs over
## different index ranges write to the same file -- confirm this is intended.
fn = "112_experiment_all_data.pkl"
_dir = os.path.join(P.FP_SIGNIFICANCE_PSEUDO_SUMMARY_DIR, 'custom_bertopic', TRAIN_OR_ALL, 'all_data')

## exist_ok makes this a no-op for an existing directory and avoids the
## check-then-create race
os.makedirs(_dir, exist_ok=True)

all_data_fp = os.path.join(_dir, fn)

## both buffers are keyed by the applicant's (year, id, name) tuple
with open(all_data_fp, "wb") as f:
    pickle.dump({
        "candidate_sents_info_buffer": candidate_sents_info_buffer,
        "chunk_debug_info_buffer": chunk_debug_info_buffer,
    }, f)

In [None]:
print("Finish generating pseudo summary from {} to {}".format(START_IDX, END_IDX))