# README

### Purpose of this notebook
- Preprocess recommendation letters.

## Application Preprocess

In [None]:
import pandas as pd
import numpy as np
import os
import re
import string
from itertools import chain
from collections import defaultdict
from importlib import reload

from tqdm import tqdm
tqdm.pandas(desc="progress: ")

# Chinese character set
from zhon import hanzi
import opencc

# Utility variable
import sys
sys.path.insert(0, '../..')

# var
import var.var as V
import var.path as P

# utils
import utils.coverage as C
import utils.data as D
import utils.io as IO
import utils.get_path as GP
import utils.preprocess as PP

In [None]:
# Minimum chunk length: recommendation_letter_generate_chunks keeps only the
# chunks whose PP.get_sent_len() is strictly greater than this value.
MIN_CHUNK_LEN = 6

In [None]:
# OpenCC converter built from the 's2tw' configuration (presumably Simplified
# Chinese -> Traditional Chinese, Taiwan standard -- confirm against OpenCC docs).
# NOTE(review): `cc` is not referenced anywhere else in the visible notebook.
cc = opencc.OpenCC('s2tw')

## CKIP tools

In [None]:
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker

In [None]:
# CKIP word segmenter, part-of-speech tagger and named-entity chunker; they are
# used by extract_recommendation_letter_information to detect the referee name.
# NOTE(review): device=0 presumably selects the first GPU -- confirm one is available.
ws_driver  = CkipWordSegmenter(device=0)
pos_driver = CkipPosTagger(device=0)
ner_driver = CkipNerChunker(device=0)

## Read Data from DataFrame

In [None]:
# Load the applications table through the project helper and preview its last rows.
df_applications = D.read_df_applications()
df_applications.tail()

In [None]:
# Load the recommendation-letter table through the project helper and preview its first rows.
df_recommendation_letters = D.read_df_recommendation_letters()
df_recommendation_letters.head()

In [None]:
def recommendation_letters_paragraph_combine(row):
    """Join the paragraph cells of one recommendation-letter row into one text.

    The paragraphs are read from positional columns 3..8 of ``row``; missing
    (NaN) paragraphs are skipped and the remaining ones are joined with
    newlines.

    Args:
        row: a ``pandas.Series`` holding one row of the recommendation-letter
            dataframe.

    Returns:
        str: the non-missing paragraph cells, each converted with ``str`` and
        separated by ``'\\n'``.
    """
    ## slice by position first and drop NaN afterwards: dropping first shifts
    ## the positions, so a missing paragraph would pull in a column that lies
    ## after the paragraph columns
    paragraphs = row.iloc[3:9].dropna()
    return '\n'.join(str(p) for p in paragraphs)
    
# Store the combined paragraph text of every letter in a new 'all_paragraph_sent' column.
df_recommendation_letters['all_paragraph_sent'] = \
    df_recommendation_letters.apply(recommendation_letters_paragraph_combine, axis=1)

In [None]:
df_recommendation_letters.head()

## Extract and map the information of reference for each individual recommendation letter

### Map the recommendation letter between dataframe and application page

In [None]:
import jieba

In [None]:
def extract_application_recommendation_letter_span(row, debug=False):
    """Return the application pages that make up the recommendation-letter section.

    Args:
        row: mapping-like application record. Reads
            ``row['section_span']['推薦信']`` (its first two items are used as
            start and end page positions) and ``row['application_pages']``.
        debug: when true, print the extracted pages.

    Returns:
        ``row['application_pages'][start:end]``, or ``[]`` when the section
        span or the pages cannot be looked up.
    """
    try:
        rl_span_idx = row['section_span']['推薦信']
        rl_span = row['application_pages'][rl_span_idx[0]:rl_span_idx[1]]
    except (KeyError, IndexError, TypeError):
        ## missing key, too-short span, or a non-subscriptable value such as
        ## None/NaN: the application has no usable recommendation-letter section
        rl_span = []

    if debug:
        print(rl_span)

    return rl_span

In [None]:
def extract_application_recommendation_letters(row, debug=False):
    """Cut the recommendation-letter section of an application into letters.

    Every page of the section that contains the title "Letter of Reference"
    is treated as a letter start. For year 111 a letter runs up to the next
    title page; for year 112 and later it runs up to the second next title
    page. When that page does not exist the letter runs to the end of the
    section. For other years no letter is cut out.

    Args:
        row: application record; ``row['year']`` is read here and the record
            is passed on to extract_application_recommendation_letter_span.
        debug: when true, print the positions of the title pages (the flag is
            also forwarded to the span extraction).

    Returns:
        tuple: ``(letters, title_page_positions)`` where ``letters`` is a list
        of page lists and ``title_page_positions`` are positions inside the
        recommendation-letter section.
    """
    year = row['year']
    pages = extract_application_recommendation_letter_span(row, debug)

    title = "Letter of Reference"
    title_page_ids = [pid for pid, page in enumerate(pages) if title in page]

    ## number of title pages to step over to reach the end of one letter;
    ## None means that no letter is cut out for this year
    if year == 111:
        stride = 1
    elif year >= 112:
        stride = 2
    else:
        stride = None

    letters = []
    if stride is not None:
        total = len(title_page_ids)
        for pos, start in enumerate(title_page_ids):
            ## an open-ended slice takes the rest of the section
            end = title_page_ids[pos + stride] if pos + stride < total else None
            letters.append(pages[start:end])

    if debug:
        print(title_page_ids)

    return letters, title_page_ids

In [None]:
def map_recommendation_letters(row, debug=False):
    """Map the letters of one application to rows of df_recommendation_letters.

    For every dataframe letter of the same ``year`` and ``id`` the paragraph
    text (positional columns 3..8) is tokenised with jieba and scored against
    every letter found in the application pages with
    ``C.calculate_coverage``. Each application letter is then assigned to the
    dataframe letter with the highest score.

    Args:
        row: application record; reads ``row['year']`` and ``row['id']`` and
            is passed on to extract_application_recommendation_letters.
        debug: when true, print intermediate results.

    Returns:
        ``None`` for years before 111. Otherwise a dict-like object that maps
        the index label of each matching df_recommendation_letters row to the
        title-page position of its application letter, or to ``None`` when no
        application letter was assigned to it. When several application
        letters pick the same dataframe letter, the last one wins.
    """
    _year = row['year']
    _id = row['id']

    ## only process recommendation letter after year 111
    if _year < 111:
        return None

    df_rls = df_recommendation_letters.query('`year` == {} and `id` == {}'.format(_year, _id))
    app_rls, app_rl_fp_idx = extract_application_recommendation_letters(row, debug)

    if debug:
        print(_year, _id)

    ## the joined page text of every application letter is the same for all
    ## dataframe letters, so build it once
    targets = [''.join(app_rl) for app_rl in app_rls]

    ## calculate the coverage score of the recommendation letters
    ## between dataframe and application pages
    map_result_buf = {}
    for df_rl_idx, df_rl in df_rls.iterrows():
        ## skip missing paragraphs; joining a NaN cell would raise TypeError
        paragraphs = df_rl.to_list()[3:9]
        key = jieba.lcut(''.join(str(p) for p in paragraphs if pd.notna(p)))

        rls_coverage_score = {}
        for app_rl_idx, target in zip(app_rl_fp_idx, targets):
            ## a letter without any token cannot cover anything
            rls_coverage_score[app_rl_idx] = C.calculate_coverage(key, target) if key else 0

        map_result_buf[df_rl_idx] = rls_coverage_score

    ## columns: dataframe letters, rows: application letters
    df_map_result = pd.DataFrame(map_result_buf)

    if debug:
        print(df_map_result)

    ## find the mapped recommendation letters
    ## (defaultdict(None) has no default factory, so it behaves like a plain
    ## dict; the type is kept for callers)
    map_result = defaultdict(None)
    for app_rl_idx, scores in df_map_result.iterrows():
        map_result[scores.idxmax()] = app_rl_idx
    ## deal with non-mapped outliers of recommendation letter from dataframe
    for df_rl_idx in df_map_result.columns:
        map_result.setdefault(df_rl_idx, None)

    if debug:
        print(map_result)

    return map_result

In [None]:
%%time
# One mapping per application row (None for applications before year 111).
map_app_df_rl_series = df_applications.apply(map_recommendation_letters, axis=1)

### Extract the information of reference for each individual recommendation letter

In [None]:
def _find_reference_name(name_candidates, debug=False):
    """Return the first candidate line that CKIP tags as a bare person name.

    A candidate qualifies when its named entities are exactly one ``PERSON``
    and its part-of-speech tags are either all ``Nb`` or exactly the set
    ``{'FW', 'WHITESPACE'}``.

    Args:
        name_candidates: list of cleaned text lines.
        debug: when true, print the candidates and the CKIP results.

    Returns:
        str: the first qualifying candidate, or ``""`` when there is none.
    """
    if debug:
        print(name_candidates)
        print("start to find name")

    ## nothing to analyse: do not call the models at all
    if not name_candidates:
        return ""

    ws  = ws_driver(name_candidates, batch_size=1024, show_progress=False)
    pos = pos_driver(ws, batch_size=1024, show_progress=False)
    ner = ner_driver(name_candidates, batch_size=1024, show_progress=False)

    for sentence, sentence_ws, sentence_pos, sentence_ner in zip(name_candidates, ws, pos, ner):
        ners = [entity.ner for entity in sentence_ner]

        if debug:
            print(sentence)
            for entity in sentence_ner:
                print(entity)
            ## word(POS) pairs; formatted inline because no packing helper is
            ## defined in this notebook
            print('\u3000'.join(
                '{}({})'.format(word, tag) for word, tag in zip(sentence_ws, sentence_pos)
            ))
            print()

        ## find NER with 'PERSON'
        if ners != ['PERSON']:
            continue

        ## check pos is 'Nb' or 'FW' + 'WHITESPACE'
        if set(sentence_pos) == {'Nb'} or set(sentence_pos) == {'FW', 'WHITESPACE'}:
            if debug:
                print("name:", sentence)
            return sentence

    return ""


def extract_recommendation_letter_information(row, debug=False):
    """Extract the referee information lines of every letter in an application.

    For each letter the text between the markers "服務機關" and
    "推薦人填寫部份" is split into lines. Form labels, digit runs and ASCII
    punctuation are removed from each line, lines containing "、" or "@" and
    lines containing one of the sub-keywords are dropped, and only lines with
    at least two characters after stripping are kept. For year 112 a referee
    name detected with the CKIP models is additionally put in front.

    Args:
        row: application record; reads ``row['year']`` and ``row['id']`` and
            is passed on to extract_application_recommendation_letters.
        debug: when true, print intermediate results.

    Returns:
        dict: title-page position of each letter -> list of information lines.
    """
    _year = row['year']
    _id = row['id']

    if debug:
        print(_year, _id)

    rls, rl_fp_idx = extract_application_recommendation_letters(row, debug)

    ## extract the information of the reference
    info_start = "服務機關"
    info_end = "推薦人填寫部份"

    ## one ASCII punctuation mark; the characters are escaped so that the
    ## backslash is part of the class instead of escaping the following ']'
    re_punc = '[{}]'.format(re.escape(string.punctuation))

    ## regex patterns that are deleted from every line, applied in this order
    ## NOTE(review): short patterns such as 'E' and 'of' are also deleted inside
    ## longer words -- confirm that this is intended
    remove_keywords = [
        '-', '推薦人', 'Information', 'lof', 'of', 'Reference', '姓名', 'Name',
        '服務機關', 'Institute', '職稱', 'Position', '電話', 'Phone', 'Number',
        '電子郵件', 'mail Address', 'E',
        r'\d{2,}',
        re_punc,
    ]

    ## a line that still contains one of these after the deletion is dropped
    remove_keywords_sub = [
        '清華學院學士班甲組', '畢業', 'Background'
    ]

    info_list = {}

    ## remove unnecessary information
    for rl, idx in zip(rls, rl_fp_idx):
        page = ''.join(rl)

        ## NOTE(review): str.find returns -1 for a missing marker, in which
        ## case the slice below is taken relative to the end of the text
        info_si = page.find(info_start)
        info_ei = page.find(info_end)

        infos = []
        for line in page[info_si:info_ei].split('\n'):
            ## lines with an enumeration comma or an e-mail address are dropped;
            ## no pattern below can introduce either character, so checking
            ## once before the deletions is sufficient
            if "、" in line or '@' in line:
                continue

            for rkw in remove_keywords:
                line = re.sub(rkw, '', line)

            if any(rkw in line for rkw in remove_keywords_sub):
                continue

            line = line.strip()
            if len(line) >= 2:
                infos.append(line)

        ## for 112, find name of info
        if _year == 112:
            first_page = rl[0]
            ## look for "姓名" again, starting two characters after its first
            ## occurrence in the joined pages
            remain = first_page.find("姓名", page.find("姓名") + 2)

            name_candidates = []
            for candidate in first_page[remain:].split('\n'):
                candidate = candidate.replace("姓名", '').replace("Name", '')
                candidate = re.sub(re_punc, '', candidate).strip()
                if candidate:
                    name_candidates.append(candidate)

            ## find name
            name = _find_reference_name(name_candidates, debug)
            if name != "":
                infos.insert(0, name)

        info_list[idx] = infos

        if debug:
            print("Final info:")
            for line in infos:
                print(line)
            print('--')

    if debug:
        IO.print_dividing_line()

    return info_list

In [None]:
# One dict per application row: title-page position -> extracted referee information lines.
app_rl_info_series = df_applications.progress_apply(extract_recommendation_letter_information, axis=1)

### Match the information of reference to the recommendation letter dataframe

In [None]:
map_app_df_rl_series

In [None]:
app_rl_info_series

In [None]:
## referee information keyed by the index label of df_recommendation_letters;
## a defaultdict, so a letter that never received a mapping reads back as []
mapped_info = defaultdict(list)

for mapping, info_dict in zip(map_app_df_rl_series, app_rl_info_series):
    ## no mapping was produced for this application (None or empty)
    if not mapping:
        continue

    for df_rl_idx, app_rl_idx in mapping.items():
        try:
            info = info_dict[app_rl_idx]
        except (KeyError, TypeError):
            ## the letter was not matched to an application letter
            ## (app_rl_idx is None) or nothing was extracted for that letter
            info = []
        mapped_info[df_rl_idx] = info

In [None]:
mapped_info

In [None]:
## referee information in dataframe row order; indexing the defaultdict gives
## [] for every letter that has no entry
list_infos = [mapped_info[row_label] for row_label in df_recommendation_letters.index]

In [None]:
# Attach the referee information lists as a new 'info' column.
df_recommendation_letters['info'] = list_infos

In [None]:
df_recommendation_letters['info'].value_counts()

In [None]:
df_recommendation_letters.head()

In [None]:
df_recommendation_letters.tail()

### test data

In [None]:
# test_tuple = [
#    "# The content is removed due to confidential concerns."
# ]

### Preprocess recommendation letters sentences

In [None]:
def recommendation_letter_preprocess(text):
    """Normalise ASCII punctuation that is used next to Chinese text.

    ASCII commas, semicolons and periods that directly touch a Chinese
    character or Chinese punctuation mark are replaced with their full-width
    counterparts, and the HTML entity ``&amp;`` is replaced with ``&``.

    Args:
        text: the letter text.

    Returns:
        str: the normalised text.
    """
    ## character class: one Chinese character or Chinese punctuation mark
    re_ch_p = '[{}]'.format(hanzi.characters + hanzi.punctuation)

    ## replace english comma next to a Chinese character with Chinese comma
    p = r"(?<={0}),|,(?={0})".format(re_ch_p)
    text = re.sub(p, '，', text)
    ## replace english semicolon next to a Chinese character with Chinese semicolon
    p = r"(?<={0});|;(?={0})".format(re_ch_p)
    text = re.sub(p, '；', text)
    ## replace english period with Chinese period when it follows a Chinese
    ## character and precedes a non-digit, or follows a non-digit and precedes
    ## a Chinese character (so decimal numbers are left alone)
    p = r"(?<={0})\.(?=\D)|(?<=\D)\.(?={0})".format(re_ch_p)
    text = re.sub(p, '。', text)
    ## replace '&amp;' with '&'
    text = text.replace('&amp;', '&')

    return text

In [None]:
# for _year, _id, _ in test_tuple:
#     row = df_recommendation_letters.query('`year` == {} and `id` == {}'.format(_year, _id))
#     text = row['all_paragraph_sent'].to_list()[1]
#     text = recommendation_letter_preprocess(text)
    
#     print(text)
        
#     IO.print_dividing_line()

In [None]:
# Normalise the punctuation of the combined letter text in place.
df_recommendation_letters['all_paragraph_sent'] = df_recommendation_letters['all_paragraph_sent'].progress_apply(recommendation_letter_preprocess)

### Split into sentences with Chinese and English punctuation
- NLTK cannot be used to tokenize Chinese sentences

In [None]:
## Sentence-splitting patterns. Each one consists of a single capture group so
## that re.split() keeps the separator in its result.
## sentence-ending punctuation (Chinese and ASCII)
re_split_stop_punc = r"([！？｡。；!;?])"
## ASCII period that does not follow a digit
re_split_eng_period = r"((?<!\d)\.)"
## number bullet such as "1." (a number followed by a period, not part of a decimal)
re_split_num_bullet = r"((?<!\d)\d+\.(?!\d))"
## Chinese number bullet such as "一、"
re_split_ch_num_bullet = r"([一二三四五六七八九十壹貳參肆伍陸柒捌玖拾]、)"
## symbol bullet
re_split_bullet = r"([★●◆➢])"

In [None]:
def recommendation_letter_split_sentences(sent):
    """Split letter text into a list of unique sentences.

    The text is split, in this order, at sentence-ending punctuation, at
    ASCII periods, at symbol bullets, at number bullets and at Chinese number
    bullets. Empty sentences, duplicates and sentences that are contained in
    another sentence are removed.

    Args:
        sent: one paragraph as ``str`` or a list of paragraphs.

    Returns:
        list of str: the sentences in their original order; ``[]`` for falsy
        input.
    """
    if not sent:
        return []

    if isinstance(sent, str):
        sent = [sent]

    def split_paragraph(paragraph, p, punc_location):
        """Split ``paragraph`` at ``p`` and glue every separator back on.

        ``punc_location`` is "back" when the separator ends a sentence and
        "front" when it starts one.
        """
        ## p has a single capture group, so the result alternates
        ## text, separator, text, ..., text
        buf_sent = re.split(p, paragraph.strip())

        if punc_location == "back":
            ## pairs are (text, separator); the last text stays alone
            first, p_sent = 0, []
        elif punc_location == "front":
            ## the text before the first separator stays alone, then the
            ## pairs are (separator, text)
            first, p_sent = 1, buf_sent[:1]
        else:
            raise ValueError("unknown punc_location: {!r}".format(punc_location))

        for i in range(first, len(buf_sent), 2):
            ## the slice is one element short at the end of the list
            p_sent.append(''.join(buf_sent[i:i + 2]))

        return p_sent

    ## split paragraph into sentences
    split_steps = (
        (re_split_stop_punc, "back"),
        (re_split_eng_period, "back"),
        (re_split_bullet, "front"),
        (re_split_num_bullet, "front"),
        (re_split_ch_num_bullet, "front"),
    )
    for pattern, punc_location in split_steps:
        sent = list(chain.from_iterable(
            split_paragraph(_s, pattern, punc_location) for _s in sent
        ))

    ## remove empty string; the second pass re-checks the stripped sentences,
    ## which the first pass only tested before stripping
    sent = [_s.strip() for _s in sent if not PP.is_empty_sent(_s)]
    sent = [_s.strip() for _s in sent if not PP.is_empty_sent(_s)]

    ## remove duplicate sentences (dict keys keep the first occurrence in order)
    sent = list(dict.fromkeys(sent))

    ## remove sentences that is a substring of another sentences
    sent = [
        _s for _s in sent
        if not any(_s != _ss and _s in _ss for _ss in sent)
    ]

    return sent

In [None]:
# for _year, _id, _ in test_tuple:
#     row = df_recommendation_letters.query('`year` == {} and `id` == {}'.format(_year, _id))
#     text = row['all_paragraph_sent'].to_list()[1]
    
#     for sent in recommendation_letter_split_sentences(text):
#         print(sent)
        
#     IO.print_dividing_line()

In [None]:
# Replace the combined letter text with its list of sentences.
df_recommendation_letters['all_paragraph_sent'] = df_recommendation_letters['all_paragraph_sent'].progress_apply(recommendation_letter_split_sentences)

### Generate chunks for aligning with comment clustering model

In [None]:
## Chunk separators: Chinese/ASCII punctuation and newline, an ASCII period
## that follows a non-digit and precedes a space, or a Chinese number bullet.
## The pattern has no capture group, so re.split() drops the separators.
re_split_chunks = r"[！？｡。，；,!;?\n]|(?<=\D)\.(?= )|[一二三四五六七八九十壹貳參肆伍陸柒捌玖拾]、"

In [None]:
def recommendation_letter_generate_chunks(sent):
    """Cut the sentences of a letter into short chunks.

    The sentences are joined with '。', bullet markers are replaced with '。',
    and the text is split at every match of ``re_split_chunks``. Each chunk
    is further split by ``PP.split_whitespace_btn_ch_character``, stripped of
    one leading and one trailing punctuation mark, and kept only when
    ``PP.get_sent_len`` is greater than ``MIN_CHUNK_LEN``.

    Args:
        sent: list of sentences.

    Returns:
        list of str: the chunks in text order; ``[]`` for falsy input.
    """
    if not sent:
        return []

    s = '。'.join(sent)
    s = s.replace('>', '')

    ## replace every bullet marker (number bullet, symbol bullet, Chinese
    ## number bullet) with 。 so that the text is split at the bullet
    p = r'((?<!\d)\d+\.(?!\d)|[★●◆➢]|[一二三四五六七八九十壹貳參肆伍陸柒捌玖拾]、)'
    s = re.sub(p, '。', s).strip()

    ## split sentence with punctuation
    punc_list = re.findall(re_split_chunks, s)
    pieces = re.split(re_split_chunks, s)

    ## combine split sentence with punctuation; re_split_chunks has no capture
    ## group, so there is exactly one more piece than separators and the last
    ## piece gets nothing appended
    sent = [piece + punc for piece, punc in zip(pieces, punc_list + [''])]

    ## split whitespace between chinese character (except for english sentence)
    sent = list(chain.from_iterable(PP.split_whitespace_btn_ch_character(_s) for _s in sent))
    ## remove empty string
    sent = [_s for _s in sent if not PP.is_empty_sent(_s)]

    ## remove preceeding punctuation (one character only)
    leading_punc = hanzi.non_stops + "。" + V.EN_PUNC_NON_STOPS
    sent = [_s[1:] if _s[0] in leading_punc else _s for _s in sent]
    sent = [_s.strip() for _s in sent]
    sent = [_s for _s in sent if not PP.is_empty_sent(_s)]

    ## remove trailing punctuation if it is none stop punctuation (one character only)
    trailing_punc = hanzi.non_stops + "。.;" + V.EN_PUNC_NON_STOPS
    sent = [_s[:-1] if _s[-1] in trailing_punc else _s for _s in sent]
    sent = [_s.strip() for _s in sent]
    sent = [_s for _s in sent if not PP.is_empty_sent(_s)]

    ## remove too small chunks
    sent = [_s for _s in sent if PP.get_sent_len(_s) > MIN_CHUNK_LEN]

    return sent

In [None]:
# for _year, _id, _ in test_tuple:
#     row = df_recommendation_letters.query('`year` == {} and `id` == {}'.format(_year, _id))
#     sent = row['all_paragraph_sent'].to_list()[0]
    
#     for chunk in recommendation_letter_generate_chunks(sent):
#         print(chunk)
        
#     IO.print_dividing_line()

In [None]:
# Derive the chunk list of every letter from its sentence list.
df_recommendation_letters['all_paragraph_chunk'] = df_recommendation_letters['all_paragraph_sent'].progress_apply(recommendation_letter_generate_chunks)

In [None]:
## make sure every falsy entry is stored as an empty list
df_recommendation_letters['all_paragraph_sent'] = (
    df_recommendation_letters['all_paragraph_sent'].apply(lambda sent: sent or [])
)

In [None]:
## make sure every falsy entry is stored as an empty list
df_recommendation_letters['all_paragraph_chunk'] = (
    df_recommendation_letters['all_paragraph_chunk'].apply(lambda chunk: chunk or [])
)

In [None]:
df_recommendation_letters.tail()

## Save the results

In [None]:
# Persist the processed dataframe through the project helper, once per requested file type.
D.write_df_recommendation_letters(df_recommendation_letters, file='csv')
D.write_df_recommendation_letters(df_recommendation_letters, file='pkl')