# README

### Purpose of this notebook
- Use the Google Cloud Vision API (OCR) to extract the text content of applications stored in PDF format.

### Steps
0. Set up the GCP environment and create a GCP project (not done in this notebook).
1. Create a bucket inside the project to store application files.
2. Upload the files to the cloud and check the result.
3. Use google vision API to detect the text inside the application.
4. Download the raw detection results (in a batch of json files) to local machine.
5. Post-process the detection results and get pure text for each application.

For preprocessing the text, go to `application_preprocess` notebook.

In [None]:
# gcp
from google.cloud import storage
from google.cloud import vision

import os
import json
import re
from importlib import reload
import time
import pickle
import pandas as pd
from tqdm import tqdm

# Utility variable
import sys
sys.path.insert(0, '../..')

# var
import var.var as V
import var.path as P

# utils
import utils.data as D
import utils.io as IO

In [None]:
# GCP project id, read from the environment (None when the variable is unset).
PROJECT_ID = os.environ.get('GOOGLE_RESEARCH_PROJECT_ID')

## Upload files to bucket

### Create new bucket

In [None]:
# Name of the bucket that the cells below use for the application files and the OCR output.
RESEARCH_BUCKET_NAME = 'nthu-idea-lab-jason-research'
# Shared Cloud Storage client; most helper functions below read this module-level name.
storage_client = storage.Client()

In [None]:
def create_bucket_class_location(bucket_name, storage_class="COLDLINE", location="ASIA-EAST1"):
    """
    Create a bucket with the requested storage class and location.

    Uses the module-level ``storage_client``. Prints the name, location and
    storage class reported for the created bucket, then returns that bucket.
    """
    pending = storage_client.bucket(bucket_name)
    pending.storage_class = storage_class

    created = storage_client.create_bucket(pending, location=location)

    summary = "Created bucket {} in {} with storage class {}".format(
        created.name, created.location, created.storage_class
    )
    print(summary)

    return created

In [None]:
# Create the research bucket with the function's default storage class and location.
create_bucket_class_location(RESEARCH_BUCKET_NAME)

### List buckets

In [None]:
def list_buckets():
    """Print the name of every bucket returned by the shared ``storage_client``."""
    for found in storage_client.list_buckets():
        print(found.name)

In [None]:
# Print the bucket names, e.g. to confirm that the bucket created above exists.
list_buckets()

### List files in a bucket

In [None]:
def list_blobs(bucket_name):
    """Print the name of each blob stored in ``bucket_name``."""
    # Note: Client.list_blobs requires at least package version 1.17.0.
    for entry in storage_client.list_blobs(bucket_name):
        print(entry.name)

In [None]:
# list_blobs(RESEARCH_BUCKET_NAME)

In [None]:
def list_blobs_with_prefix(bucket_name, prefix, delimiter=None, _print=True, _return=True):
    """Collect (and optionally print) the blob names that start with ``prefix``.

    A prefix such as "public/" selects everything inside that "folder".
    Without a delimiter the whole tree under the prefix is listed, so for

        a/1.txt
        a/b/2.txt

    prefix='a/' yields both names. With prefix='a/' and delimiter='/' only the
    blob directly under 'a/' is listed:

        a/1.txt

    and the listing's ``prefixes`` attribute then carries the "subfolders":

        a/b/

    When ``_print`` is true the names are printed under a "Blobs:" header and,
    if a delimiter was given, the prefixes under a "Prefixes:" header.
    Returns the list of blob names when ``_return`` is true, otherwise None.
    """
    # Note: Client.list_blobs requires at least package version 1.17.0.
    listing = storage_client.list_blobs(bucket_name, prefix=prefix, delimiter=delimiter)

    # the listing is consumed completely here, before ``prefixes`` is read below
    names = [item.name for item in listing]

    if _print:
        print("Blobs:")
        for name in names:
            print(name)

        if delimiter:
            print("Prefixes:")
            for sub_prefix in listing.prefixes:
                print(sub_prefix)

    return names if _return else None

In [None]:
# list_blobs_with_prefix(
#     bucket_name=RESEARCH_BUCKET_NAME, 
#     prefix='', 
#     delimiter='/'
# )

### Upload file to bucket

In [None]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload one local file to a bucket and print a confirmation.

    bucket_name: name of the target bucket, e.g. "your-bucket-name"
    source_file_name: local path of the file, e.g. "local/path/to/file"
    destination_blob_name: name of the object to write, e.g. "storage-object-name"
    """
    target = storage_client.bucket(bucket_name).blob(destination_blob_name)
    target.upload_from_filename(source_file_name)

    message = "File {} uploaded to {}.".format(
        source_file_name, destination_blob_name
    )
    print(message)

In [None]:
# Display the local PDF directories that the upload cell below iterates over.
P.FP_FULL_APPLICATIONS_PDF_DIR

In [None]:
## Upload the PDFs of year 112 only; every other directory is printed and skipped.
import posixpath

for _dir in P.FP_FULL_APPLICATIONS_PDF_DIR:
    print("local directory: {}".format(_dir))

    ## the path inside the bucket is the local path without its first two components
    gcp_dir = '/'.join(_dir.split('/')[2:])
    print("gcp bucket: {}".format(gcp_dir))

    if '112' not in _dir:
        continue

    for _file in tqdm(os.listdir(_dir)):
        source_file_name = os.path.join(_dir, _file)

        ## os.listdir also returns sub-directories (e.g. .ipynb_checkpoints),
        ## which cannot be uploaded as a single file
        if not os.path.isfile(source_file_name):
            continue

        ## gcp Cloud Storage operates with a flat namespace
        ## ,which means that folders don't actually exist within Cloud Storage.
        ## The object name is therefore always joined with '/'.
        destination_blob_name = posixpath.join(gcp_dir, _file)
        print(destination_blob_name)

        upload_blob(
            bucket_name=RESEARCH_BUCKET_NAME,
            source_file_name=source_file_name,
            destination_blob_name=destination_blob_name
        )

    IO.print_dividing_line()

In [None]:
def get_gcs_source_uri(_year, _id, bucket_name=RESEARCH_BUCKET_NAME):
    """Return the ``gs://`` URI of the pdf of application ``_id`` in year ``_year``."""
    template = 'gs://{}/data/applications/full_application/{}/pdf/{}.pdf'
    return template.format(bucket_name, _year, _id)

def get_gcs_source_uri_prefix(_year):
    """Return the in-bucket prefix (no scheme, no bucket name) of the pdf folder of ``_year``."""
    template = 'data/applications/full_application/{}/pdf/'
    return template.format(_year)
    
def get_gcs_destination_uri(_year, _id=None, bucket_name=RESEARCH_BUCKET_NAME):
    """Return the ``gs://`` URI of the raw OCR output folder.

    With a truthy ``_id`` the URI points at that application's sub-folder,
    otherwise at the folder of the whole year. Both forms end with '/'.
    """
    year_uri = 'gs://{}/data/applications/full_application/{}/txt_ocr_raw/'.format(
        bucket_name, _year
    )
    if not _id:
        return year_uri
    return year_uri + '{}/'.format(_id)

def get_gcs_destination_uri_prefix(_year, _id=None):
    """Return the in-bucket prefix (no scheme, no bucket name) of the raw OCR output.

    With a truthy ``_id`` the prefix points at that application's sub-folder,
    otherwise at the folder of the whole year. Both forms end with '/'.
    """
    year_prefix = 'data/applications/full_application/{}/txt_ocr_raw/'.format(_year)
    if not _id:
        return year_prefix
    return year_prefix + '{}/'.format(_id)

In [None]:
# list_blobs(RESEARCH_BUCKET_NAME)

In [None]:
# Display the in-bucket prefix of the uploaded PDFs of year 112.
get_gcs_source_uri_prefix(112)

In [None]:
# Print every blob under the PDF prefix of year 112 to check the upload.
# delimiter='' is falsy, so no "Prefixes:" section is printed.
list_blobs_with_prefix(
    bucket_name=RESEARCH_BUCKET_NAME, 
    prefix=get_gcs_source_uri_prefix(112), 
    delimiter='',
    _print=True,
    _return=False
)

## Detect text in pdf files

In [None]:
def async_detect_document(gcs_source_uri, gcs_destination_uri, _print=False, *,
                          mime_type='application/pdf', batch_size=3):
    """OCR with PDF/TIFF as source files on GCS.

    Submits one asynchronous DOCUMENT_TEXT_DETECTION request to the Vision API
    and blocks until the operation has finished.

    Args:
        gcs_source_uri: ``gs://`` URI of the file to annotate.
        gcs_destination_uri: ``gs://`` URI passed to the Vision API as the
            destination of the json output.
        _print: when true, print progress messages and the elapsed time.
        mime_type: mime type of the source file. Supported mime_types are:
            'application/pdf' and 'image/tiff'.
        batch_size: how many pages should be grouped into each json output file.

    Returns:
        The value returned by ``operation.result()``.
    """
    if _print:
        start_time = time.time()
        print('Source: {}, waiting for the operation to finish.'.format(gcs_source_uri))

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    ## wait for the operation to complete
    response = operation.result()

    if _print:
        print("Output written to GCS: {}".format(gcs_destination_uri))

        end_time = time.time()
        print("Execute time: {:.2f} sec".format(end_time - start_time))

    return response

### Convert test files

In [None]:
# _id_test_list = [
#     # The content is removed due to confidential concerns.
# ]

In [None]:
# for _year, _id in tqdm(_id_test_list):
#     gcs_source_uri = get_gcs_source_uri(_year, _id)
#     gcs_destination_uri = get_gcs_destination_uri(_year, _id)

#     print(gcs_source_uri)
#     print(gcs_destination_uri)
#     res = async_detect_document(gcs_source_uri, gcs_destination_uri)

In [None]:
# list_blobs_with_prefix(
#     bucket_name=RESEARCH_BUCKET_NAME, 
#     prefix=get_gcs_source_uri_prefix(106), 
#     delimiter='',
#     _print=True,
#     _return=False
# )

## Convert all pdf files with OCR

In [None]:
## Run OCR on every pdf of the selected years that has no output in the bucket yet.
for _year in range(112, 113):
    gcs_source_uri_prefix = get_gcs_source_uri_prefix(_year)

    blobs = list_blobs_with_prefix(
        bucket_name=RESEARCH_BUCKET_NAME,
        prefix=gcs_source_uri_prefix,
        delimiter='',
        _print=False,
        _return=True
    )

    ## one listing of the whole output folder replaces one listing per pdf
    gcs_destination_uri_prefix = get_gcs_destination_uri_prefix(_year)
    converted_blobs = list_blobs_with_prefix(
        bucket_name=RESEARCH_BUCKET_NAME,
        prefix=gcs_destination_uri_prefix,
        delimiter='',
        _print=False,
        _return=True
    )

    ## a blob named <prefix><id>/... marks <id> as converted; this is exactly
    ## the set of ids for which a listing of <prefix><id>/ would be non-empty
    converted_ids = set()
    for converted_blob in converted_blobs:
        converted_id, _sep, _rest = converted_blob[len(gcs_destination_uri_prefix):].partition('/')
        if _sep:
            converted_ids.add(converted_id)

    print("Year: {}".format(_year))

    ## get_gcs_source_uri always appends '.pdf', so any other object
    ## (e.g. a "folder" placeholder) cannot be a valid source
    pdf_blobs = [blob for blob in blobs if blob.endswith('.pdf')]

    for blob in tqdm(pdf_blobs):
        _id = blob.split('/')[-1][:-len('.pdf')]  ## remove file extension and get id

        ## no need to convert if the file is already converted
        if _id in converted_ids:
            continue

        gcs_source_uri = get_gcs_source_uri(_year, _id)
        gcs_destination_uri = get_gcs_destination_uri(_year, _id)

        _ = async_detect_document(gcs_source_uri, gcs_destination_uri)

    IO.print_dividing_line()

## List converted files

In [None]:
# Print every blob under the raw OCR output prefix of year 112.
list_blobs_with_prefix(
    bucket_name=RESEARCH_BUCKET_NAME, 
    prefix=get_gcs_destination_uri_prefix(112), 
    delimiter='',
    _print=True,
    _return=False
)

In [None]:
# Same call as the previous cell: prints the raw OCR output blobs of year 112 again.
list_blobs_with_prefix(
    bucket_name=RESEARCH_BUCKET_NAME, 
    prefix=get_gcs_destination_uri_prefix(112), 
    delimiter='',
    _print=True,
    _return=False
)

### Download the detection results

In [None]:
def download_blob(bucket_name, source_blob_name, destination_file_name, _print=False):
    """Download one blob from a bucket into a local file.

    bucket_name: name of the bucket, e.g. "your-bucket-name"
    source_blob_name: name of the object to fetch, e.g. "storage-object-name"
    destination_file_name: local target path, e.g. "local/path/to/file"
    _print: when true, print a confirmation after the download
    """
    # `Bucket.blob` only builds a client-side handle; unlike `Bucket.get_blob`
    # it does not retrieve anything from Google Cloud Storage, which is all
    # that is needed before downloading.
    remote = storage_client.bucket(bucket_name).blob(source_blob_name)
    remote.download_to_filename(destination_file_name)

    if not _print:
        return

    message = "Downloaded storage object {} from bucket {} to local file {}.".format(
        source_blob_name, bucket_name, destination_file_name
    )
    print(message)

In [None]:
## Download the raw OCR output of the selected years, mirroring the bucket
## layout below '../..'.
for _year in range(112, 113):
    print("Year: {}".format(_year))

    gcs_destination_uri_prefix = get_gcs_destination_uri_prefix(_year)

    blobs = list_blobs_with_prefix(
        bucket_name=RESEARCH_BUCKET_NAME,
        prefix=gcs_destination_uri_prefix,
        delimiter='',
        _print=False,
        _return=True
    )

    for blob in tqdm(blobs):
        ## a name ending with '/' is a "folder" placeholder object; there is
        ## no file to write for it
        if blob.endswith('/'):
            continue

        local_dir_prefix = '/'.join(blob.split('/')[:-1])
        local_dir = os.path.join('../..', local_dir_prefix)
        local_file_name = os.path.join('../..', blob)

        ## an existing directory is fine; any other failure (e.g. missing
        ## permission) is raised instead of being silently ignored
        os.makedirs(local_dir, exist_ok=True)

        download_blob(
            bucket_name=RESEARCH_BUCKET_NAME,
            source_blob_name=blob,
            destination_file_name=local_file_name
        )

    IO.print_dividing_line()

## Post-process the detection results
- organize the split responses into one single file

In [None]:
# Display the directories that the post-processing cell below writes the merged OCR text to.
P.FP_FULL_APPLICATIONS_TXT_OCR_DIR

In [None]:
# Display the year labels; the cells below compare each of them with the string '112'.
P.YEAR_DIRS

In [None]:
# Display the directories that the post-processing cell below reads the raw OCR output from.
P.FP_FULL_APPLICATIONS_TXT_OCR_RAW_DIR

In [None]:
# Display the directories that the post-processing cell below writes the merged OCR text to.
P.FP_FULL_APPLICATIONS_TXT_OCR_DIR

In [None]:
## Merge the split OCR responses of every application of year 112 into one
## json file holding the list of page texts.
for year, year_txt_ocr_raw_dir, year_txt_ocr_dir in zip(
    P.YEAR_DIRS, P.FP_FULL_APPLICATIONS_TXT_OCR_RAW_DIR, P.FP_FULL_APPLICATIONS_TXT_OCR_DIR):

    if year != '112':
        continue

    ## an existing directory is fine; other failures are raised, not hidden
    os.makedirs(year_txt_ocr_raw_dir, exist_ok=True)
    os.makedirs(year_txt_ocr_dir, exist_ok=True)

    print("Year: {}".format(year))

    ## each sub-directory of the raw directory is treated as one application;
    ## stray files would make os.listdir fail further down
    _ids = [
        f for f in os.listdir(year_txt_ocr_raw_dir)
        if 'ipynb_checkpoints' not in f
        and os.path.isdir(os.path.join(year_txt_ocr_raw_dir, f))
    ]
    print(_ids)

    for _id in tqdm(_ids):
        _dir = os.path.join(year_txt_ocr_raw_dir, _id)

        files = [f for f in os.listdir(_dir) if 'output' in f]
        ## sort by the integer after the first '-' so the batches are read in
        ## numeric order (names presumably look like output-<first>-to-<last>.json
        ## -- TODO confirm against the downloaded files)
        files = sorted(files, key=lambda f: int(f.split('-')[1]))

        app_texts = []

        for file in files:
            rfp = os.path.join(_dir, file)

            ## explicit encoding: the platform default is not always utf-8
            with open(rfp, 'r', encoding='utf-8') as rf:
                res = json.load(rf)

            for page in res['responses']:
                try:
                    page_text = page['fullTextAnnotation']['text']
                except (KeyError, TypeError):
                    ## a response without a text annotation is kept as an
                    ## empty page so the list length still matches the responses
                    page_text = ""

                app_texts.append(page_text)

        wfp = os.path.join(year_txt_ocr_dir, "{}.json".format(_id))
        with open(wfp, 'w', encoding='utf-8') as wf:
            ## write page texts to file
            json.dump(app_texts, wf)
#             print(app_texts)

## Check the detection results

In [None]:
## Print the merged page texts of every application of year 112.
for year, year_txt_ocr_dir in zip(P.YEAR_DIRS, P.FP_FULL_APPLICATIONS_TXT_OCR_DIR):
    if year != '112':
        continue

    print(year)

    for app in os.listdir(year_txt_ocr_dir):
        ## only names containing ".json" are inspected
        if ".json" in app:
            print(year, app)
            fp = os.path.join(year_txt_ocr_dir, app)

            with open(fp, 'r') as f:
                app_texts = json.load(f)

            print("Number of pages: {}".format(len(app_texts)))

            ## pages are numbered from 1 in the printed headers
            for pn, page_text in enumerate(app_texts, 1):
                IO.print_dividing_line("Page {}".format(pn))
                print(page_text)

            IO.print_dividing_line()

    IO.print_dividing_line()

# =========================================
# Danger Zone
# =========================================

## Rename File

In [None]:
def rename_blob(bucket_name, blob_name, new_name):
    """Rename a blob inside a bucket and print the old and the new name.

    bucket_name: name of the bucket, e.g. "your-bucket-name"
    blob_name: current name of the object, e.g. "your-object-name"
    new_name: name the object should get, e.g. "new-object-name"

    A new storage client is created for this call.
    """
    client = storage.Client()
    holder = client.bucket(bucket_name)
    source = holder.blob(blob_name)

    renamed = holder.rename_blob(source, new_name)

    message = "Blob {} has been renamed to {}".format(source.name, renamed.name)
    print(message)

### DELETE OBJECT

In [None]:
def delete_blob(bucket_name, blob_name):
    """Delete the blob ``blob_name`` from ``bucket_name`` and print a confirmation.

    A new storage client is created for this call.
    """
    client = storage.Client()
    doomed = client.bucket(bucket_name).blob(blob_name)
    doomed.delete()

    message = "Blob {} deleted.".format(blob_name)
    print(message)

### DELETE BUCKET

In [None]:
def delete_bucket(bucket_name):
    """Delete the bucket ``bucket_name`` and print a confirmation.

    The bucket must be empty. It is looked up through the shared
    ``storage_client`` before it is deleted.
    """
    doomed = storage_client.get_bucket(bucket_name)
    doomed.delete()

    message = "Bucket {} deleted".format(doomed.name)
    print(message)