# Document Classification Test (HeavyWater Machine Learning Challenge)


**Problem Statement**

We process documents related to mortgages, aka everything that happens to originate a mortgage that you don't see as a borrower. Often times the only access to a document we have is a scan of a fax of a print out of the document. Our system is able to read and comprehend that document, turning a PDF into structured business content that our customers can act on.

This dataset represents the output of the OCR stage of our data pipeline ...  Each word in the source is mapped to one unique value in the output. If the word appears in multiple documents then that value will appear multiple times. The word order for the dataset comes directly from our OCR layer, so it should be roughly in order.

**Mission**

Train a document classification model. Deploy your model to a public cloud platform (AWS/Google/Azure/Heroku) as a webservice, send us an email with the URL to you github repo, the URL of your publicly deployed service so we can submit test cases and a recorded screen cast demo of your solution's UI, its code and deployment steps. Also, we use AWS so we are partial to you using that ... just saying.

## Setup

### Library import
We import all the required Python libraries

In [4]:
from time import asctime, gmtime, localtime, perf_counter
print(asctime(localtime()))

t0 = perf_counter()

# from platform import node
from collections import Counter, OrderedDict
import gc		# garbage collection module
import os
import pathlib
import pickle
# import pprint
from random import random
import sys

print("Python version: ", sys.version_info[:])
print("Un-versioned imports:\n")
prefixStr = ''
if 'collections' in sys.modules:
    print(prefixStr + 'collections', end="")
    prefixStr = ', '
if 'gc' in sys.modules:
    print(prefixStr + 'gc', end="")
    prefixStr = ', '
if 'os' in sys.modules:
    print(prefixStr + 'os', end="")
    prefixStr = ', '
if 'pathlib' in sys.modules:
    print(prefixStr + 'pathlib', end="")
    prefixStr = ', '
if 'pickle' in sys.modules:
    print(prefixStr + 'pickle', end="")
    prefixStr = ', '
if 'pprint' in sys.modules:
    print(prefixStr + 'pprint', end="")
    prefixStr = ', '
if 'random' in sys.modules:
    print(prefixStr + 'random', end="")
    prefixStr = ', '
if 'sys' in sys.modules:
    print(prefixStr + 'sys', end="")
    prefixStr = ', '

duVersion = None
from dateutil import __version__ as duVersion
from dateutil.parser import parse
import numpy as np

mdVersion = None
from modin import __version__ as mdVersion
import modin.pandas as pd
# import pandas as pd
ppVersion = None
# from pandas_profiling import ProfileReport
# from pandas_profiling import __version__ as ppVersion

import graphviz

gpdVersion = None
# from geopandas import __version__ as gpdVersion
# import geopandas as gpd

shpVersion = None
from shapely import __version__ as shpVersion
from shapely.geometry import Point

scVersion = None
from scipy import __version__ as scVersion
import scipy.sparse as sp

skVersion = None
from sklearn import __version__ as skVersion
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import ComplementNB
from sklearn.utils import class_weight

tfVersion = None
# from tensorflow import __version__ as tfVersion
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras import Input
# from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, concatenate
# from tensorflow.keras.models import Model
# from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
# from tensorflow.keras.utils import plot_model

jlVersion = None
# from joblib import __version__ as jlVersion
# from joblib import dump, load

# Visualizations

mpVersion = None
from matplotlib import __version__ as mpVersion
import matplotlib.pyplot as plt

import seaborn as sns
import colorcet as cc

print("\n")
if 'colorcet' in sys.modules:
    print(f"colorcet: {cc.__version__}", end="\t")
if 'cufflinks' in sys.modules:
    print(f"cufflinks: {cf.__version__}", end="\t")
if 'dateutil' in sys.modules:
    print(f"dateutil: {duVersion}", end="\t")
if 'geopandas' in sys.modules:
    print(f"geopandas: {gpdVersion}", end="\t")
if 'graphviz' in sys.modules:
    print(f"graphviz: {duVersion}", end="\t")
if 'joblib' in sys.modules:
    print(f"joblib: {jlVersion}", end="\t")
if 'matplotlib' in sys.modules:
    print(f"matplotlib: {mpVersion}", end="\t")
if 'modin' in sys.modules:
    print(f"modin: {mdVersion}", end="\t")
if 'numpy' in sys.modules:
    print(f"numpy: {np.__version__}", end="\t")
if 'pandas' in sys.modules:
    print(f"pandas: {pd.__version__}", end="\t")
if 'pandas_profiling' in sys.modules:
    print(f"pandas_profiling: {ppVersion}", end="\t")
if 'scipy' in sys.modules:
    print(f"scipy: {scVersion}", end="\t")
if 'seaborn' in sys.modules:
    print(f"seaborn: {sns.__version__}", end="\t")
if 'shapely' in sys.modules:
    print(f"shapely: {shpVersion}", end="\t")
if 'sklearn' in sys.modules:
    print(f"sklearn: {skVersion}", end="\t")
if 'tensorflow' in sys.modules:
    print(f"tensorflow: {tfVersion}", end="\t")
# if '' in sys.modules:
#     print(f": {.__version__}", end="\t")
Δt = perf_counter() - t0
print(f"\n\nΔt: {Δt: 4.1f}s.")

%matplotlib inline

# Options for pandas
pd.options.display.max_columns = 30
pd.options.display.max_rows = 50

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

Fri Jan  8 13:50:58 2021
Python version:  (3, 6, 9, 'final', 0)
Un-versioned imports:

collections, gc, os, pathlib, pickle, pprint, random, sys

colorcet: 1.0.0	dateutil: 2.8.1	graphviz: 2.8.1	joblib: None	matplotlib: 3.3.0	modin: 0.8.2	numpy: 1.19.4	pandas: 0.8.2	scipy: 1.4.1	seaborn: 0.11.0	shapely: 1.7.0	sklearn: 0.22.1	

Δt:  0.0s.


### Local library import
We import all the required local libraries libraries

In [6]:
# Include local library paths
sys.path.append('/home/mark/work/Mlib') # uncomment and fill to import local libraries
# Import local libraries
# from utility import DataSci as util
# from utility import symSpellPlus as ssp
# from utility import KerasEarlyStopCallback as esCallback
from plotHelpers import plotHelpers as ph

**Next two lines are for pretty output for all prints in a Pandas cell, not just the last**

In [7]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Prepare Data

### Data import

Describe sources of data (with links, where possible)

In [9]:
rootPath = pathlib.Path.cwd().parent
dataPath = rootPath / 'data'
modelPath = rootPath / 'model'

### Data Munging

Beat data into shape, impute, select variables (delete colinear features, etc.)

### Parameter definition
We set all relevant parameters for our notebook. By convention, parameters are uppercase, while all the 
other variables follow Python's guidelines.

## Modeling

Put here the core of the notebook. Feel free di further split this section into subsections.

# References
