# README

### Purpose of this notebook
- Create the application dataframe.
- Preprocess application.

### Steps

#### Create the application dataframe
1. Read the application text
2. Find the width and height of each application page

#### Preprocess application
1. Split each application into multiple documents (currently year 111 only)
2. Extract self-statement from application
3. Preprocess self-statement

## Application Preprocess

In [None]:
import pandas as pd
import os
import re
import json
import string
from collections import Counter
from itertools import chain
from importlib import reload

from tqdm import tqdm
tqdm.pandas(desc="progress: ")

# Chinese character set
from zhon import hanzi
import opencc

# Utility variable
import sys
sys.path.insert(0, '../..')

# var
import var.var as V
import var.path as P

# utils
import utils.data as D
import utils.io as IO
import utils.get_path as GP
import utils.preprocess as PP

In [None]:
# Length thresholds used by the cleaning functions below. In every case the
# length is measured with PP.get_sent_len and the text is kept only when its
# length is strictly greater than the threshold.

# Threshold for chunks produced by self_statement_chunk_spliting.
MIN_CHUNK_LEN = 6
# Threshold for sentences of Chinese documents (shorter ones are assumed to be titles/headings).
MIN_ZH_SENT_LEN = 10
# Threshold for sentences of English documents (shorter ones are assumed to be titles/headings).
MIN_EN_SENT_LEN = 6

In [None]:
# OpenCC converter used by the Chinese cleaning function to turn Simplified
# Chinese into Traditional Chinese ('s2tw' configuration).
cc = opencc.OpenCC('s2tw')

In [None]:
# NOTE(review): this import lives outside the top import cell; consider moving it there.
import pysbd
# Sentence-boundary detectors, one per language; used by the zh/en cleaning functions.
zh_sent_segmenter = pysbd.Segmenter(language="zh")
en_sent_segmenter = pysbd.Segmenter(language="en")

## Read Data from DataFrame

In [None]:
# Load the application dataframe through the project helper. Later cells read
# the 'boundaries' and 'application_pages' columns from it.
df_applications = D.read_df_applications()
df_applications.head()

In [None]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df_applications.shape

## Extract self-statement

In [None]:
# Keyword looked up verbatim on cover pages (the pages listed in row['boundaries']).
self_statement_keyword = "# The content is removed due to confidential concerns."
# Keywords for the fallback branch of extract_self_statement. They are matched against
# page.lower().replace(' ', ''), so every entry must be lower-case and contain no
# spaces, otherwise it can never match.
self_statement_keyword_list = ["# The content is removed due to confidential concerns."]
study_plan_keyword_list = ["# The content is removed due to confidential concerns."]

In [None]:
def extract_self_statement(row):
    """Return the pages of one application that are taken to be the self-statement.

    Two strategies are tried in order:

    1. Split applications (year 111 and later, per the original comments):
       the first cover page -- a page whose index is listed in
       ``row['boundaries']`` -- containing ``self_statement_keyword`` marks the
       section. The pages after that cover page, up to (not including) the
       next boundary, are returned. If that cover page is the last boundary
       the section runs to the end of the application.
    2. Fallback (year 110 and earlier, per the original comments): pages from
       index 4 onwards are scanned for ``self_statement_keyword_list`` and
       ``study_plan_keyword_list`` (matched on lower-cased text with spaces
       removed) and a window of at most 10 pages is chosen from the hits.

    Parameters
    ----------
    row : mapping
        Must provide ``'boundaries'`` (sequence of page indices) and
        ``'application_pages'`` (list of page texts).

    Returns
    -------
    list
        A slice of ``row['application_pages']``.
    """
    boundaries = row['boundaries']
    app = row['application_pages']

    ## for self-statement after year 111 (inclusive)
    for i, pn in enumerate(boundaries):
        if self_statement_keyword in app[pn]:
            next_idx = i + 1
            # The matched cover page may be the last boundary; there is then no
            # following boundary to stop at, so take everything up to the end
            # instead of raising IndexError.
            if next_idx < len(boundaries):
                end_page = boundaries[next_idx]
            else:
                end_page = len(app)
            return app[pn + 1:end_page]

    ## for self-statement before year 110 (inclusive)
    first_scanned = 4  # pages 0-3 are never scanned for keywords
    ss_pages = []
    sp_pages = []
    for pn, page in enumerate(app[first_scanned:], start=first_scanned):
        # Normalise once per page instead of once per keyword.
        normalized = page.lower().replace(' ', '')
        if any(kw in normalized for kw in self_statement_keyword_list):
            ss_pages.append(pn)
        if any(kw in normalized for kw in study_plan_keyword_list):
            sp_pages.append(pn)

    if not ss_pages and not sp_pages:
        # No hint at all: fall back to a fixed window.
        start_page = 5
        end_page = 10
    elif not ss_pages:
        # Only study-plan pages found: end 4 pages after the last one and look
        # back at most 10 pages (never before page 5).
        end_page = sp_pages[-1] + 4
        start_page = max(5, end_page - 10)
    elif not sp_pages:
        # Only self-statement pages found: take 10 pages from the first hit.
        start_page = ss_pages[0]
        end_page = start_page + 10
    else:
        start_page = ss_pages[0]
        end_page = min(start_page + 10, sp_pages[-1] + 4)

    return app[start_page:end_page]

In [None]:
# Row-wise extraction (needs both 'boundaries' and 'application_pages'), with a
# tqdm progress bar. The result is a list of page texts per application.
df_applications['self_statement'] = df_applications.progress_apply(
    extract_self_statement, axis=1
)

In [None]:
# Replace the current index with a fresh 0..n-1 range, in place, discarding the old index.
df_applications.reset_index(drop=True, inplace=True)

## Preprocess self-statement

### Self-statement cleaning and sentence segmentation

In [None]:
def zh_self_statement_cleaning_and_sentence_spliting(ss):
    """Clean a Chinese self-statement and split it into sentences.

    Steps: convert Simplified to Traditional Chinese, drop ``>`` characters,
    replace ASCII ``, ; ! .`` that touch Chinese text with their full-width
    counterparts, segment with pysbd, split further on bullet markers, then
    remove duplicates, sentences contained in another sentence, and sentences
    whose ``PP.get_sent_len`` is not greater than ``MIN_ZH_SENT_LEN``.

    Parameters
    ----------
    ss : str
        The whole self-statement text.

    Returns
    -------
    list of str
        Cleaned sentences in their original order; ``[]`` for empty input.
    """
    if not ss:
        return []

    ## Convert Simplified Chinese to Traditional Chinese
    ss = cc.convert(ss)
    ## Remove > symbol
    ss = ss.replace('>', '')

    ## character class matching one Chinese character or Chinese punctuation mark
    re_ch_p = '[{}]'.format(hanzi.characters + hanzi.punctuation)

    ## replace english comma / semicolon / exclamation mark that is preceded or
    ## followed by Chinese text with the corresponding Chinese punctuation
    for en_punc, zh_punc in ((',', '，'), (';', '；'), ('!', '！')):
        p = "(?<={ch}){punc}|{punc}(?={ch})".format(ch=re_ch_p, punc=en_punc)
        ss = re.sub(p, zh_punc, ss)
    ## replace english period next to Chinese text with Chinese period; the \D
    ## guards keep decimal numbers and numbered bullets such as "1." untouched
    p = r"(?<={})\.(?=\D)|(?<=\D)\.(?={})".format(re_ch_p, re_ch_p)
    ss = re.sub(p, '。', ss)

    ## split whitespace between chinese character (except for english sentence)
    ss = ''.join(PP.split_whitespace_btn_ch_character(ss))
    ## segment sentence by pysbd library
    ss_sent = zh_sent_segmenter.segment(ss)

    ## Further splits. Each pattern is a capture group, so re.split keeps the
    ## bullet marker as its own item; markers are removed again by the length
    ## filter at the end when they are short enough.
    split_patterns = (
        r"((?<!\d)\d+\.(?!\d))",                  # number bullet: "1."
        "([{}]+、)".format(PP.CH_NUMBER),           # chinese number bullet: "一、"
        r"(\([{}]+\))".format(PP.CH_NUMBER),       # chinese number bullet: "(一)"
        "([{}])".format(PP.BULLET_POINT),          # bullet symbols
    )
    for pattern in split_patterns:
        ss_sent = list(chain.from_iterable(re.split(pattern, _s) for _s in ss_sent))

    ## remove preceding or trailing whitespace, then duplicate sentences
    ## (dict.fromkeys keeps the first occurrence and the original order)
    ss_sent = list(dict.fromkeys(_s.strip() for _s in ss_sent))

    ## remove sentences that are a substring of another sentence
    ss_sent = [
        _s for _s in ss_sent
        if not any(_s != other and _s in other for other in ss_sent)
    ]

    ## hope to remove title or heading
    ss_sent = [_s for _s in ss_sent if PP.get_sent_len(_s) > MIN_ZH_SENT_LEN]

    return ss_sent

In [None]:
def en_self_statement_cleaning_and_sentence_spliting(ss):
    """Clean an English self-statement and split it into sentences.

    Newlines are turned into spaces, the text is segmented with pysbd (twice,
    as in the original code), split on bullet symbols, and then duplicates,
    sentences contained in another sentence, and sentences whose
    ``PP.get_sent_len`` is not greater than ``MIN_EN_SENT_LEN`` are removed.

    Parameters
    ----------
    ss : str
        The whole self-statement text.

    Returns
    -------
    list of str
        Cleaned, whitespace-stripped sentences in their original order;
        ``[]`` for empty input.
    """
    if not ss:
        return []

    ss = ss.replace('\n', ' ')

    ss_sent = en_sent_segmenter.segment(ss)
    ## second pass over every sentence -- presumably to split sentences the
    ## first pass left merged; kept as in the original
    ss_sent = list(chain.from_iterable(en_sent_segmenter.segment(_s) for _s in ss_sent))

    ## segment bullet point
    re_split_bullet = "([{}])".format(PP.BULLET_POINT)
    ss_sent = list(chain.from_iterable(re.split(re_split_bullet, _s) for _s in ss_sent))

    ## Strip BEFORE de-duplicating (as the Chinese version does). Otherwise
    ## " X" and "X " count as different sentences, neither is a substring of
    ## the other, and both survive as duplicates once stripped at the end.
    ## dict.fromkeys keeps the first occurrence and the original order.
    ss_sent = list(dict.fromkeys(_s.strip() for _s in ss_sent))

    ## remove sentences that are a substring of another sentence
    ss_sent = [
        _s for _s in ss_sent
        if not any(_s != other and _s in other for other in ss_sent)
    ]

    ## hope to remove title or heading
    ss_sent = [_s for _s in ss_sent if PP.get_sent_len(_s) > MIN_EN_SENT_LEN]

    return ss_sent

In [None]:
def self_statement_cleaning_and_sentence_spliting(ss_pages):
    """Join the self-statement pages and clean them with the matching language pipeline.

    The document is treated as English when fewer than 10% of its characters
    are Chinese (per ``PP.is_zh_character``), otherwise as Chinese.

    Parameters
    ----------
    ss_pages : list of str
        Page texts of the self-statement.

    Returns
    -------
    list of str
        Cleaned sentences; ``[]`` when there are no pages or all pages are empty.
    """
    if not ss_pages:
        return []

    ss = ''.join(ss_pages)
    # Pages can exist but be empty strings; without this guard the ratio below
    # divides by zero.
    if not ss:
        return []

    ## check the language of the document
    zh_char_count = sum(1 for ch in ss if PP.is_zh_character(ch))
    zh_char_rate = zh_char_count / len(ss)

    if zh_char_rate < 0.1:  ## english document preprocess
        return en_self_statement_cleaning_and_sentence_spliting(ss)
    ## chinese document preprocess
    return zh_self_statement_cleaning_and_sentence_spliting(ss)

In [None]:
# Clean and sentence-split every self-statement (a list of page texts per row),
# with a tqdm progress bar.
df_applications['self_statement_sent'] = df_applications['self_statement'].progress_apply(
    self_statement_cleaning_and_sentence_spliting
)

### Transformer as language model
- Remove noisy sentences using a perplexity-like score produced by a transformer (encoder-decoder) model
    - the score is the model's loss when it is asked to generate the sentence itself
- Source: https://gist.github.com/yuchenlin/eb63e2d0513f70cfc9bb85fa5a78953b

In [None]:
import torch
from torch import Tensor

In [None]:
# Index of the CUDA device to run the language model on.
GPU_NUM = 0
# Use the requested GPU when CUDA is available; otherwise fall back to the CPU
# so the notebook still runs (slowly) on a machine without a GPU.
device = torch.device("cuda:{}".format(GPU_NUM) if torch.cuda.is_available() else "cpu")

# Sentences are kept only when their score (the model loss, see
# noise_sentence_removal) is strictly below this value.
PERPLEXITY_THRESHOLD = 5

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

In [None]:
# Load the pretrained mBART-50 tokenizer and model ("facebook/mbart-large-50")
# and move the model to `device`. They are used below to score sentences.
mbart_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50")
mbart_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50").to(device)

In [None]:
def noise_sentence_removal(sents, tokenizer, model, threshold=None):
    """Keep only the sentences the language model can reproduce with a low loss.

    Each sentence is encoded and fed to ``model`` as both input and labels;
    the first model output (the loss) is used as the score. Note that this is
    the raw loss, not ``exp(loss)``, even though the notebook calls it
    "perplexity".

    Parameters
    ----------
    sents : list of str
        Sentences to score.
    tokenizer
        Object with ``encode(text, truncation=..., max_length=...)`` returning token ids.
    model
        Callable as ``model(input_ids, labels=input_ids)`` whose first output is the loss.
    threshold : float, optional
        Sentences with a score strictly below this value are kept. Defaults to
        the notebook-level ``PERPLEXITY_THRESHOLD``.

    Returns
    -------
    list of str
        The retained sentences in their original order; ``[]`` for empty input.
    """
    ## Can not process in batch because the loss would be merged together
    if not sents:
        return []

    if threshold is None:
        threshold = PERPLEXITY_THRESHOLD

    # Put the inputs on the device the model actually lives on; fall back to
    # the notebook-level `device` for models that do not expose `.device`.
    model_device = getattr(model, 'device', None)
    if model_device is None:
        model_device = device

    normal_sent = []

    for sent in sents:
        token_ids = tokenizer.encode(sent, truncation=True, max_length=1020)
        input_ids = torch.tensor(token_ids).unsqueeze(0).to(model_device)  # Batch size 1

        with torch.no_grad():
            outputs = model(input_ids, labels=input_ids)

        # outputs[0] is the loss because labels were passed.
        sentence_score = outputs[0].item()

        if sentence_score < threshold:
            normal_sent.append(sent)

    return normal_sent

In [None]:
# Score the sentences of every application and keep the low-loss ones. Only the
# sentence column is needed, so iterate over it directly instead of building a
# Series per row with iterrows().
self_statement_sent_noise_removed = [
    noise_sentence_removal(ss_sent, mbart_tokenizer, mbart_model)
    for ss_sent in tqdm(df_applications['self_statement_sent'], total=df_applications.shape[0])
]

In [None]:
# Overwrite the sentence column with the filtered sentences (same row order as the loop above).
df_applications['self_statement_sent'] = self_statement_sent_noise_removed

### Generate chunks for aligning with comment clustering model

In [None]:
# Chunk delimiters: Chinese/English punctuation and newlines, plus an English
# period that follows a non-digit and is followed by a space (so decimals such
# as "3.5" are not split). Raw string: \D and \. are regex escapes, not Python
# string escapes.
re_split_chunks = r"[！？｡。，；,!;?\n]|(?<=\D)\.(?= )"

In [None]:
def self_statement_chunk_spliting(ss_sent):
    """Split cleaned sentences into punctuation-delimited chunks.

    The sentences are joined with ``。``, bullet markers are replaced by ``。``,
    and the text is split on ``re_split_chunks``. Each chunk then has one
    leading and one trailing non-stop punctuation mark removed, and chunks
    whose ``PP.get_sent_len`` is not greater than ``MIN_CHUNK_LEN`` are dropped.

    Parameters
    ----------
    ss_sent : list of str
        Sentences of one self-statement.

    Returns
    -------
    list of str or None
        The chunks, or ``None`` when ``ss_sent`` is empty.
    """
    if not ss_sent:
        return None

    ss = '。'.join(ss_sent)
    ss = ss.replace('>', '')

    ## Replace number bullets ("1.") and bullet symbols with a 。 separator
    p = r'((?<!\d)\d+\.(?!\d)|[★●◆➢])'
    ss = re.sub(p, '。', ss).strip()

    ## split into chunks at punctuation
    punc_list = re.findall(re_split_chunks, ss)
    ss_chunk = re.split(re_split_chunks, ss)

    ## re-attach to every chunk the delimiter that followed it; the last chunk
    ## has no delimiter after it
    ss_chunk = [
        chunk + (punc_list[i] if i < len(punc_list) else '')
        for i, chunk in enumerate(ss_chunk)
    ]

    ## split whitespace between chinese character (except for english sentence)
    ss_chunk = list(chain.from_iterable(PP.split_whitespace_btn_ch_character(_s) for _s in ss_chunk))
    ## remove empty string
    ss_chunk = [_s for _s in ss_chunk if not PP.is_empty_sent(_s)]

    # Punctuation sets are loop-invariant, so build them once.
    leading_punc = hanzi.non_stops + "。" + V.EN_PUNC_NON_STOPS
    trailing_punc = hanzi.non_stops + "。.;" + V.EN_PUNC_NON_STOPS

    ## remove one preceding punctuation mark
    ss_chunk = [_s[1:] if _s[0] in leading_punc else _s for _s in ss_chunk]
    ss_chunk = [_s.strip() for _s in ss_chunk]
    ss_chunk = [_s for _s in ss_chunk if not PP.is_empty_sent(_s)]
    ## remove one trailing punctuation mark
    ss_chunk = [_s[:-1] if _s[-1] in trailing_punc else _s for _s in ss_chunk]
    ss_chunk = [_s.strip() for _s in ss_chunk]
    ss_chunk = [_s for _s in ss_chunk if not PP.is_empty_sent(_s)]
    ## remove too small chunks
    ss_chunk = [_s for _s in ss_chunk if PP.get_sent_len(_s) > MIN_CHUNK_LEN]

    return ss_chunk

In [None]:
# Split every application's sentences into chunks. Rows without sentences get
# None here; that is normalised to [] in a later cell.
df_applications['self_statement_chunk'] = df_applications['self_statement_sent'].progress_apply(
    self_statement_chunk_spliting
)

In [None]:
# Replace any falsy entry (None or an empty list) with a fresh empty list.
df_applications['self_statement_sent'] = df_applications['self_statement_sent'].apply(
    lambda sents: sents or []
)

In [None]:
# Replace any falsy entry (None or an empty list) with a fresh empty list.
df_applications['self_statement_chunk'] = df_applications['self_statement_chunk'].apply(
    lambda chunks: chunks or []
)

In [None]:
# Sanity check: for every column, print how many values are missing (True) vs present (False).
for col in df_applications.columns:
    print(df_applications[col].isna().value_counts())

In [None]:
# Eyeball the last rows, including the new self_statement* columns.
df_applications.tail()

## Save the results

In [None]:
# Persist the processed frame through the project helper, once per output format.
# NOTE(review): the list-valued columns are presumably stringified in the csv output -- confirm.
D.write_df_applications(df_applications, file='csv')
D.write_df_applications(df_applications, file='pkl')