In [None]:
import matplotlib.pyplot as plt
import numpy as np
import PIL
from PIL import Image
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

import os
import pymupdf
import pandas as pd
import shutil

import PathStorage

In [None]:
# Load the first-level classifier saved as "DocCNN.keras" (path is relative to the working directory).
model = keras.models.load_model("DocCNN.keras")
# Read the document sheet; the first CSV column is used as the DataFrame index.
doc_data = pd.read_csv(PathStorage.docdata_sheet, index_col=0)

In [None]:
def split(array, start, end, left = 0, right = 1):
    """Crop an image array to a window given as fractions of its height and width.

    Args:
        array: NumPy-style array whose first axis is rows and second axis is
            columns. Any further axes (e.g. colour channels) are kept whole,
            so both (H, W) and (H, W, C) inputs are accepted.
        start: Fraction of the height at which the crop begins.
        end: Fraction of the height at which the crop stops (exclusive).
        left: Fraction of the width at which the crop begins. Defaults to 0.
        right: Fraction of the width at which the crop stops (exclusive).
            Defaults to 1.

    Returns:
        The sliced sub-array; fractional positions are truncated with ``int``.
    """
    start_pos = int(array.shape[0] * start)
    end_pos = int(array.shape[0] * end)
    left_pos = int(array.shape[1] * left)
    right_pos = int(array.shape[1] * right)
    # Only the first two axes are indexed, so trailing axes pass through
    # untouched and a 2-D (single-channel) array no longer raises IndexError.
    return array[start_pos:end_pos, left_pos:right_pos]

In [None]:
# Start from an empty output folder so images left over from an earlier run
# are never fed to the model. The folder may not exist on a first run, so it
# is only removed when present.
if os.path.isdir(PathStorage.temp_prediction_png):
    shutil.rmtree(PathStorage.temp_prediction_png)
os.makedirs(PathStorage.temp_prediction_png, exist_ok=True)

# Render the first page of every listed PDF to a low-resolution PNG.
for index, row in doc_data.iterrows():
    pdf_path = row.loc["Path"]

    # The context manager closes the PDF once its first page has been
    # rasterised, so file handles do not pile up over the loop.
    with pymupdf.open(pdf_path) as doc:
        page = doc.load_page(0)
        pix = page.get_pixmap(dpi=20)
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    # File name is the zero-padded row label of the sheet (requires an integer index).
    img_path = os.path.join(PathStorage.temp_prediction_png, "{:05d}".format(index) + ".png")

    image.save(img_path)


In [None]:
# Folder holding the first-page PNGs and the batch / resize settings used to load them.
data_dir = PathStorage.temp_prediction_png
batch_size, img_height, img_width = 32, 180, 180

# Unlabelled, unshuffled, no validation split: every image in the folder is
# loaded as RGB and resized to (img_height, img_width).
loader_kwargs = {
    "labels": None,
    "validation_split": None,
    "image_size": (img_height, img_width),
    "batch_size": batch_size,
    "color_mode": 'rgb',
    "shuffle": False,
}
prediction_set = tf.keras.preprocessing.image_dataset_from_directory(data_dir, **loader_kwargs)

print(prediction_set)

In [None]:
# Run the first-level model over every batch of the prediction dataset.
result = model.predict(prediction_set)



In [None]:
# Class label for each model output position.
class_names=['Backpage', 'GC', 'Normal']

# Preview at most the first 100 predictions; bounding by len(result) avoids an
# IndexError when fewer than 100 documents were processed.
for i in range(min(100, len(result))):
    score = tf.nn.softmax(result[i])

    # Row number, winning class name and its softmax score as a percentage.
    print(str(i) + " " + class_names[np.argmax(score)] + " " + str(np.max(score) * 100))

In [None]:

# Index of the highest-scoring output for every predicted image.
result_series = pd.Series(np.argmax(result, axis=1))

if len(result_series) != len(doc_data):
    raise ValueError(
        "Got {} predictions for {} documents; re-render the images before predicting.".format(
            len(result_series), len(doc_data)
        )
    )

# Predictions arrive in file-name order (the dataset is built with
# shuffle=False) and the files are named after zero-padded row labels, so the
# i-th prediction belongs to the i-th smallest label. Re-labelling the series
# makes the column assignments below align by label even when the sheet's
# index is not 0..n-1. (Five-digit padding sorts correctly for labels 0-99999.)
result_series.index = doc_data.index.sort_values()

# 1.0 where the model chose output 0 ('Backpage'), otherwise 0.0.
back_mask = (result_series == 0).astype(float)

# Action code: output 2 becomes 0, output 1 becomes 4, output 0 stays 0.
gc_mask = result_series.mask(result_series == 2, 0)
gc_mask = gc_mask.replace(1, 4)

doc_data["Backpage"] = back_mask
doc_data["Action"] = gc_mask

doc_data.head(5)

In [None]:
# Rows whose document has exactly two pages, and rows classified as a back page.
num_mask = doc_data["Num Pages"] == 2
backpage_mask = doc_data["Backpage"] == 1

# Two-page documents that are themselves back pages get Blank = 1.
total_back = num_mask & backpage_mask
doc_data.loc[total_back, "Blank"] = 1

# True on each row whose *next* row is a back page. fill_value=False keeps the
# mask boolean (a plain shift would insert NaN and degrade it to object dtype)
# and also works on an empty frame, where patching the last position would raise.
offset_backpage_mask = backpage_mask.shift(-1, fill_value=False)

# Two-page documents immediately followed by a back page also get Blank = 1.
total_front = num_mask & offset_backpage_mask
doc_data.loc[total_front, "Blank"] = 1

offset_backpage_mask


In [None]:
# Write the sheet, including the Backpage / Action / Blank columns, back to the same CSV it was read from.
doc_data.to_csv(PathStorage.docdata_sheet)

# Second Level
## Form Processing

In [None]:
# Re-read the sheet from disk; the first CSV column is used as the index.
doc_data = pd.read_csv(PathStorage.docdata_sheet, index_col=0)
# Load the second-level model saved as "DocCNNSecondLevel.keras".
form_model = keras.models.load_model("DocCNNSecondLevel.keras")
# Display the model configuration as the cell output.
form_model.get_config()


In [None]:
def split(array, start, end, left = 0, right = 1):
    """Return the sub-rectangle of ``array`` described by fractional bounds.

    The same helper is also defined earlier in this notebook; this definition
    replaces it when the cell runs.

    Args:
        array: Array with rows on axis 0 and columns on axis 1; extra axes
            (such as colour channels) are optional and returned unchanged.
        start: Top edge as a fraction of the number of rows.
        end: Bottom edge (exclusive) as a fraction of the number of rows.
        left: Left edge as a fraction of the number of columns. Defaults to 0.
        right: Right edge (exclusive) as a fraction of the number of columns.
            Defaults to 1.

    Returns:
        The cropped array; edge positions are truncated to integers.
    """
    start_pos = int(array.shape[0] * start)
    end_pos = int(array.shape[0] * end)
    left_pos = int(array.shape[1] * left)
    right_pos = int(array.shape[1] * right)
    # Indexing only rows and columns lets 2-D inputs through as well as 3-D
    # ones; the original trailing ":" required a third axis.
    return array[start_pos:end_pos, left_pos:right_pos]

In [None]:
# Rows passed to the second-level model: Action is not 4 and Backpage is 0.
form_mask = (doc_data["Action"] != 4) & (doc_data["Backpage"] == 0)
# Take an independent copy: form_docs is written to further down, and writing
# into a boolean-filtered slice of doc_data triggers SettingWithCopyWarning.
form_docs = doc_data[form_mask].copy()
form_docs.tail(5)

In [None]:
# Start from an empty output folder so stale crops from an earlier run are not
# predicted on. The folder may not exist on a first run.
if os.path.isdir(PathStorage.temp_prediction_png_second):
    shutil.rmtree(PathStorage.temp_prediction_png_second)
os.makedirs(PathStorage.temp_prediction_png_second, exist_ok=True)

# Render the first page of each selected PDF and save a fixed crop of it.
for index, row in form_docs.iterrows():
    pdf_path = row.loc["Path"]

    # Close the PDF as soon as its first page has been rasterised.
    with pymupdf.open(pdf_path) as doc:
        page = doc.load_page(0)
        pix = page.get_pixmap(dpi=150)
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    # The page is already 8-bit RGB, so it is cropped directly rather than
    # being widened to int32 and narrowed back to uint8.
    array = np.asarray(image)

    # Keep rows 15%-40% and columns 5%-90% of the page.
    sliced = split(array, .15, .4, .05, .9)
    im = Image.fromarray(np.ascontiguousarray(sliced))

    # File name is the zero-padded row label of the sheet.
    img_path = os.path.join(PathStorage.temp_prediction_png_second, "{:05d}".format(index) + ".png")

    im.save(img_path)

    # Light progress indicator: print every row label divisible by 25.
    if(index  % 25 == 0):
        print(index)

In [None]:
# Folder holding the second-level crops and the batch / resize settings used to load them.
data_dir = PathStorage.temp_prediction_png_second
batch_size, img_height, img_width = 16, 350, 350

# Unlabelled, unshuffled, no validation split: every crop in the folder is
# loaded as RGB and resized to (img_height, img_width).
loader_kwargs = {
    "labels": None,
    "validation_split": None,
    "image_size": (img_height, img_width),
    "batch_size": batch_size,
    "color_mode": 'rgb',
    "shuffle": False,
}
form_prediction_set = tf.keras.preprocessing.image_dataset_from_directory(data_dir, **loader_kwargs)

print(form_prediction_set)

In [None]:
# Run the second-level model over every batch of cropped form images.
form_result = form_model.predict(form_prediction_set)
# Display the raw prediction array as the cell output.
form_result


In [None]:
# Class label for each second-level model output position.
form_class_names=['Update', 'Renewal', 'Initial']

# Preview at most the first 100 predictions; bounding by len(form_result)
# avoids an IndexError when fewer than 100 forms were processed.
for i in range(min(100, len(form_result))):
    score = tf.nn.softmax(form_result[i])

    # Row number, winning class name and its softmax score as a percentage.
    print(str(i) + " " + form_class_names[np.argmax(score)] + " " + str(np.max(score) * 100))

In [None]:
# Index of the highest-scoring output for every cropped form image.
form_result_series = pd.Series(np.argmax(form_result, axis=1))
# Translate model output positions into Action codes: 0 -> 2, 1 -> 0, 2 -> 1.
form_result_series = form_result_series.replace({0 : 2, 1 : 0, 2 : 1})

if len(form_result_series) != len(form_docs):
    raise ValueError(
        "Got {} predictions for {} form documents; re-render the crops before predicting.".format(
            len(form_result_series), len(form_docs)
        )
    )

# Predictions arrive in file-name order (the dataset is built with
# shuffle=False) and the files are named after zero-padded row labels, so the
# i-th prediction belongs to the i-th smallest label in form_docs. Labelling
# the series this way makes the assignments below align by row label instead
# of relying on position-versus-label behaviour that differs across pandas versions.
form_result_series.index = form_docs.index.sort_values()

# NOTE(review): the original addressed this column as position 1 and then
# copied form_docs["Action"] back, so position 1 is presumed to be "Action";
# it is addressed by name here. Confirm against the sheet's column order.
print(form_docs["Action"])
print(form_result_series)

# Rebind form_docs rather than writing into what may be a slice of doc_data.
form_docs = form_docs.assign(Action=form_result_series)

# Write the new Action codes into the full sheet for exactly the rows that were classified.
doc_data.loc[form_result_series.index, "Action"] = form_result_series


In [None]:
# Write the sheet, with the updated Action column, back to the same CSV it was read from.
doc_data.to_csv(PathStorage.docdata_sheet)

# Third Level
## Renewal Processing

In [None]:
# Re-read the sheet from disk; the first CSV column is used as the index.
doc_data = pd.read_csv(PathStorage.docdata_sheet, index_col=0)
# Load the third-level model saved as "DocCNNThirdLevel.keras".
# This rebinds form_model, which previously held the second-level model.
form_model = keras.models.load_model("DocCNNThirdLevel.keras")
# Display the model configuration as the cell output.
form_model.get_config()


In [None]:
def split(array, start, end, left = 0, right = 1):
    """Cut a fractional window out of an image array.

    This helper is also defined earlier in the notebook; running this cell
    re-binds the name to this definition.

    Args:
        array: Array indexed as (row, column[, ...]); axes beyond the first
            two are optional and are never cropped.
        start: Fraction of the row count where the window begins.
        end: Fraction of the row count where the window ends (exclusive).
        left: Fraction of the column count where the window begins.
            Defaults to 0.
        right: Fraction of the column count where the window ends
            (exclusive). Defaults to 1.

    Returns:
        The windowed array; boundaries are truncated to whole indices.
    """
    start_pos = int(array.shape[0] * start)
    end_pos = int(array.shape[0] * end)
    left_pos = int(array.shape[1] * left)
    right_pos = int(array.shape[1] * right)
    # No explicit third-axis slice: works for (H, W) as well as (H, W, C).
    return array[start_pos:end_pos, left_pos:right_pos]

In [None]:
# Rows passed to the third-level model: Action is 0 or 2.
form_mask = (doc_data["Action"] == 0) | (doc_data["Action"] == 2)
# Take an independent copy: form_docs is written to further down, and writing
# into a boolean-filtered slice of doc_data triggers SettingWithCopyWarning.
form_docs = doc_data[form_mask].copy()
form_docs.tail(5)

In [None]:
# Start from an empty output folder so stale crops from an earlier run are not
# predicted on. The folder may not exist on a first run.
if os.path.isdir(PathStorage.temp_prediction_png_third):
    shutil.rmtree(PathStorage.temp_prediction_png_third)
os.makedirs(PathStorage.temp_prediction_png_third, exist_ok=True)

# Render the first page of each selected PDF and save a fixed crop of it.
for index, row in form_docs.iterrows():
    pdf_path = row.loc["Path"]

    # Close the PDF as soon as its first page has been rasterised.
    with pymupdf.open(pdf_path) as doc:
        page = doc.load_page(0)
        pix = page.get_pixmap(dpi=250)
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    # The page is already 8-bit RGB, so it is cropped directly rather than
    # being widened to int32 and narrowed back to uint8.
    array = np.asarray(image)

    # Keep rows 19%-35% and columns 5%-55% of the page.
    sliced = split(array, .19, .35, .05, .55)
    im = Image.fromarray(np.ascontiguousarray(sliced))

    # File name is the zero-padded row label of the sheet.
    img_path = os.path.join(PathStorage.temp_prediction_png_third, "{:05d}".format(index) + ".png")

    im.save(img_path)

    # Light progress indicator: print every row label divisible by 25.
    if(index  % 25 == 0):
        print(index)

In [None]:
# Read the crops written by the third-level rendering step. The original
# pointed at temp_prediction_png_second, so the third-level model was fed the
# second-level crops and its predictions did not correspond to form_docs.
data_dir = PathStorage.temp_prediction_png_third

# Batch size and the size every crop is resized to on load.
batch_size = 16
img_height = 350
img_width = 350

# Unlabelled, unshuffled, no validation split: every crop in the folder is loaded as RGB.
form_prediction_set = tf.keras.preprocessing.image_dataset_from_directory(
  data_dir,
  labels=None,
  validation_split=None,
  image_size=(img_height, img_width),
  batch_size=batch_size,
  color_mode='rgb',
  shuffle=False)

print(form_prediction_set)

In [None]:
# Run the third-level model over every batch of the prediction dataset.
form_result = form_model.predict(form_prediction_set)
# Display the raw prediction array as the cell output.
form_result


In [None]:
# Class label for each third-level model output position.
form_class_names=['Update', 'Renewal']

# Preview at most the first 100 predictions; bounding by len(form_result)
# avoids an IndexError when fewer than 100 forms were processed.
for i in range(min(100, len(form_result))):
    score = tf.nn.softmax(form_result[i])

    # Row number, winning class name and its softmax score as a percentage.
    print(str(i) + " " + form_class_names[np.argmax(score)] + " " + str(np.max(score) * 100))

In [None]:
# Index of the highest-scoring output for every cropped form image.
form_result_series = pd.Series(np.argmax(form_result, axis=1))
# Translate model output positions into Action codes: 0 -> 2, 1 -> 0.
form_result_series = form_result_series.replace({0 : 2, 1 : 0})

if len(form_result_series) != len(form_docs):
    raise ValueError(
        "Got {} predictions for {} form documents; check that the prediction set "
        "was built from the third-level image folder.".format(
            len(form_result_series), len(form_docs)
        )
    )

# Predictions arrive in file-name order (the dataset is built with
# shuffle=False) and the files are named after zero-padded row labels, so the
# i-th prediction belongs to the i-th smallest label in form_docs. Labelling
# the series this way makes the assignments below align by row label instead
# of relying on position-versus-label behaviour that differs across pandas versions.
form_result_series.index = form_docs.index.sort_values()

# NOTE(review): the original addressed this column as position 1 and then
# copied form_docs["Action"] back, so position 1 is presumed to be "Action";
# it is addressed by name here. Confirm against the sheet's column order.
print(form_docs["Action"])
print(form_result_series)

# Rebind form_docs rather than writing into what may be a slice of doc_data.
form_docs = form_docs.assign(Action=form_result_series)

# Write the new Action codes into the full sheet for exactly the rows that were classified.
doc_data.loc[form_result_series.index, "Action"] = form_result_series


In [None]:
# Write the sheet, with the updated Action column, back to the same CSV it was read from.
doc_data.to_csv(PathStorage.docdata_sheet)