# README

### Purpose of this notebook
- Create the application dataframe.
- Preprocess application.

### Steps

#### Create the application dataframe
1. Read the application text
2. Find the width and height of each application page

#### Preprocess application
1. Split each application into multiple documents (currently only for year 111 and later)
2. Extract self-statement from application
3. Preprocess self-statement

## Application Preprocess

In [None]:
import pandas as pd
import os
import re
import json
import string
from collections import Counter
from itertools import chain
from importlib import reload

from tqdm import tqdm
tqdm.pandas(desc="progress: ")

# Chinese character set
from zhon import hanzi
import opencc

# Utility variable
import sys
sys.path.insert(0, '../..')

# var
import var.var as V
import var.path as P

# utils
import utils.data as D
import utils.io as IO
import utils.get_path as GP
import utils.preprocess as PP

In [None]:
# OpenCC converter built with the 's2tw' configuration
# (Simplified Chinese -> Traditional Chinese, Taiwan standard).
cc = opencc.OpenCC('s2tw')

## Create application dataframe

In [None]:
# Display the year labels and the OCR text directories that are iterated in parallel below.
P.YEAR_DIRS, P.FP_FULL_APPLICATIONS_TXT_OCR_DIR

### Read the text

In [None]:
# Load the existing applications dataframe; it is used below to skip
# (year, id) pairs that are already present.
df_applications_old = D.read_df_applications()

In [None]:
# Build one record per OCR'd application JSON file that is not yet present in
# df_applications_old. Only the directory of TARGET_YEAR is scanned.
TARGET_YEAR = '112'

df_applications_data = []

for year, year_txt_ocr_dir in zip(P.YEAR_DIRS, P.FP_FULL_APPLICATIONS_TXT_OCR_DIR):
    if year != TARGET_YEAR:
        continue

    _year = int(year)

    for app in os.listdir(year_txt_ocr_dir):
        # Test the extension rather than a substring, so that names such as
        # "123.json.bak" are not mistaken for application files.
        if not app.endswith(".json"):
            continue

        # The part of the file name before the first '.' is the application id.
        _id = int(app.split('.')[0])

        # Skip applications that are already known *before* parsing the file,
        # so existing entries cost no JSON decoding.
        if len(df_applications_old.query("`year` == @_year and `id` == @_id").index) != 0:
            continue

        fp = os.path.join(year_txt_ocr_dir, app)

        # JSON text is UTF-8 by specification; do not rely on the platform's
        # default encoding when reading it.
        with open(fp, 'r', encoding='utf-8') as f:
            app_texts = json.load(f)

        df_applications_data.append({
            'year': _year,
            'id': _id,
            'application_pages': app_texts,
            'num_pages': len(app_texts),
        })

# Passing the columns explicitly keeps 'year' and 'id' available even when no
# new application was found, so the following sort does not raise KeyError.
df_applications = pd.DataFrame(
    df_applications_data,
    columns=['year', 'id', 'application_pages', 'num_pages'],
)

In [None]:
# Order the rows by (year, id) and renumber the index from 0.
df_applications = (
    df_applications
    .sort_values(['year', 'id'])
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the newly built dataframe.
df_applications.head()

In [None]:
# (number of rows, number of columns) of the new dataframe.
df_applications.shape

## Find the width and height of each application page

In [None]:
# Display the OCR text directories (one entry per year in P.YEAR_DIRS).
P.FP_FULL_APPLICATIONS_TXT_OCR_DIR

In [None]:
def recursive_items(dictionary, level):
    """Walk a nested dict depth-first and yield ``(key, depth)`` pairs.

    Every key of ``dictionary`` is yielded together with ``level``. When the
    value stored under a key is itself a plain ``dict``, the keys of that
    value follow immediately, reported with ``level + 1``. Values that are not
    exactly of type ``dict`` (lists, dict subclasses, ...) are not descended
    into.
    """
    for name, child in dictionary.items():
        # The key itself is always reported, whatever its value is.
        yield name, level
        if type(child) is dict:
            yield from recursive_items(child, level + 1)

In [None]:
# For every application, collect the (width, height) of each page reported in
# its raw OCR output files.
page_width_height_data = []

for _, row in tqdm(df_applications.iterrows(), total=df_applications.shape[0]):
    _year = str(row['year'])
    _id = str(row['id'])

    _dir = GP.get_application_page_raw_ocr_dir(_year, _id)

    # Keep only the OCR output files and order them by the integer that
    # follows the first '-' in the file name.
    files = [f for f in os.listdir(_dir) if 'output' in f]
    files = sorted(files, key=lambda f: int(f.split('-')[1]))

    page_width_height = []

    for file in files:
        rfp = os.path.join(_dir, file)

        # JSON text is UTF-8 by specification; read it explicitly as such and
        # release the file handle before processing the content.
        with open(rfp, 'r', encoding='utf-8') as rf:
            res = json.load(rf)

        # Tip: recursive_items(res, 0) can be used to inspect the key hierarchy of res.

        for page in res['responses']:
            try:
                page_info = page['fullTextAnnotation']['pages'][0]
                page_width = page_info['width']
                page_height = page_info['height']
            except (KeyError, IndexError, TypeError):
                # The response carries no usable page annotation. Record a
                # (0, 0) placeholder so the list keeps one entry per response.
                page_width = 0
                page_height = 0

            page_width_height.append((page_width, page_height))

    page_width_height_data.append(page_width_height)

In [None]:
# One list of (width, height) tuples per application, built in the same row
# order as df_applications.
df_applications['page_width_height'] = page_width_height_data

In [None]:
# Preview the dataframe with the new page_width_height column.
df_applications.head()

## Split applications into multiple documents, including:
- data sheet
- qualification, transcript, eligibility
- self-statement
- portfolio
- others
- recommendation letters

## Find obvious boundaries in applications from year 111 onward (inclusive)

### Split the application

In [None]:
# Text that a page must contain to be considered a section cover-page candidate.
cover_keyword = "NATIONAL TSING HUA UNIVERSITY"

In [None]:
# Display the section titles that are searched for on cover-page candidates.
V.COVER_PAGE_TITLE_LIST

In [None]:
def split_application_year_111(row):
    """Return the page numbers of the section cover pages of one application.

    ``row`` must provide ``'year'`` and ``'application_pages'`` (the list of
    page texts). Applications from before year 111 yield an empty list.

    A page is treated as a cover page when it contains ``cover_keyword``,
    contains at least one title from ``V.COVER_PAGE_TITLE_LIST`` and its text
    is shorter than 100 characters. When several such pages contain the same
    title, only the first of them is kept.
    """
    year = row['year']
    pages = row['application_pages']

    if year < 111:
        return []

    # Candidate covers: keyword present, plus two false-positive filters
    # (a known title must appear, and the page text must be short).
    candidates = [
        page_no
        for page_no, text in enumerate(pages)
        if cover_keyword in text
        and any(title in text for title in V.COVER_PAGE_TITLE_LIST)
        and len(text) < 100
    ]

    # Drop duplicates: for every title keep only its first occurrence among
    # the candidates that are still left.
    for title in V.COVER_PAGE_TITLE_LIST:
        hits = [page_no for page_no in candidates if title in pages[page_no]]
        repeated = set(hits[1:])
        if repeated:
            candidates = [page_no for page_no in candidates if page_no not in repeated]

    return candidates

In [None]:
# Cover-page numbers per application (an empty list for years before 111).
df_applications['boundaries'] = df_applications.apply(split_application_year_111, axis=1)

### Validation

In [None]:
# How many applications have 0, 1, 2, ... detected cover pages.
df_applications['boundaries'].apply(len).value_counts()

In [None]:
# Tally of the title combinations found on the cover pages; filled by the validation loop below.
cn = Counter()

In [None]:
# Count how often each combination of section titles appears on the detected
# cover pages. The counter is (re)created here so that re-running this cell
# does not add the same applications a second time.
cn = Counter()

for _, row in df_applications.iterrows():
    boundaries = row['boundaries']
    app = row['application_pages']

    cover_pages = '\n'.join(app[pn] for pn in boundaries)
    # Titles are listed in V.COVER_PAGE_TITLE_LIST order and joined into one key.
    titles = ' '.join(
        keyword for keyword in V.COVER_PAGE_TITLE_LIST if keyword in cover_pages
    )
    cn[titles] += 1

In [None]:
# Show the tally of title combinations.
cn

### Find page span for each section

In [None]:
def find_application_section_span_year_111(row):
    """Map each section title of an application to its ``(start, end)`` pages.

    ``row`` must provide ``'year'``, ``'application_pages'`` (the list of page
    texts) and ``'boundaries'`` (the cover-page numbers). Applications from
    before year 111 yield an empty dict.

    For every title, ``start`` is the page number of its cover page and
    ``end`` is the page number of the next cover page, or ``len(app)`` for
    the last section.

    Raises:
        AssertionError: if the number of titles found on the cover pages
            differs from the number of boundaries.
    """
    year = row['year']
    app = row['application_pages']
    boundaries = row['boundaries']

    if year < 111:
        return {}

    cover_pages = '\n'.join(app[pn] for pn in boundaries)
    titles = [keyword for keyword in V.COVER_PAGE_TITLE_LIST if keyword in cover_pages]
    assert len(boundaries) == len(titles)

    # NOTE(review): titles follow the order of V.COVER_PAGE_TITLE_LIST while
    # boundaries follow page order, so pairing them position by position
    # assumes the cover pages appear in list order -- confirm.

    # Each section ends where the next one starts; the last one runs to the
    # end of the application. Computing the ends explicitly avoids using a
    # bare ``except`` as control flow for the final section.
    ends = list(boundaries[1:]) + [len(app)]

    return {
        title: (start, end)
        for title, start, end in zip(titles, boundaries, ends)
    }

In [None]:
# Title -> (start, end) page span per application (an empty dict for years before 111).
df_applications['section_span'] = df_applications.apply(find_application_section_span_year_111, axis=1)

In [None]:
# Preview the first rows, now including boundaries and section_span.
df_applications.head()

In [None]:
# Preview the last rows as well.
df_applications.tail()

#### Check the results

In [None]:
# Spot-check the last rows: print each application's cover-page numbers,
# then its section spans, followed by a dividing line.
for _, row in df_applications.tail().iterrows():
    print(row['boundaries'], row['section_span'], sep='\n')
    IO.print_dividing_line()

## Save the results

In [None]:
# Persist the dataframe in both supported formats.
# NOTE(review): df_applications only holds rows that were absent from
# df_applications_old -- confirm that write_df_applications does not discard
# the previously saved rows.
for file_format in ('csv', 'pkl'):
    D.write_df_applications(df_applications, file=file_format)