# Creating Models for IBDOCs

In [None]:
import os
import logging
import importlib
import ibnlp
importlib.reload(ibnlp)
from ibnlp import PrototypeDataset, BERTUtils, IBDOCFeaturizer, ModelContext, product_dict, PUNC_TABLE

# Define some constants and configurations.
# Set the root logger to INFO so INFO-level records are emitted.
logging.getLogger().setLevel(logging.INFO)

# Placeholder for the access token that is handed to PrototypeDataset below.
# Replace it locally; avoid committing a real token together with the notebook.
ACCESS_TOKEN = 'INSERT IB TOKEN HERE'

## 1. Load Dataset + Goldens
*Note that this can take a few minutes to run*

In [None]:
# Folders holding the processed documents, one entry per batch.
# NOTE(review): presumably remote Instabase Drive paths resolved by
# PrototypeDataset using ACCESS_TOKEN -- confirm against ibnlp.
OMF_DATA = [
    'ib_sales/OMF/fs/Instabase%20Drive/test/other-paystubs/Batch1/out/s3_map_records/',
    'ib_sales/OMF/fs/Instabase%20Drive/test/other-paystubs/Batch2/out/s3_map_records/',
    'ib_sales/OMF/fs/Instabase%20Drive/test/other-paystubs/Batch3/out/s1_process_files/',
    'ib_sales/OMF/fs/Instabase%20Drive/test/other-paystubs/Batch4/out/s1_process_files/'
]
# Golden (expected value) CSV files, one per batch, relative to the notebook.
OMF_GOLDEN = [
    './omf_batch_1.csv',
    './omf_batch_2.csv',
    './omf_batch_3.csv',
    './omf_batch_4.csv'
]
# Field names for the golden CSVs.
# NOTE(review): presumably listed in CSV column order -- verify against the files.
OMF_MAPPING = ['ssn', 'last_4_ssn', 'per_end', 'pay_date', 'file', 'doc_type', 'template_name', 'per_begin', 'employee_name', 'employer_name', 'per_ss_tax', 'ytd_ss_tax', 'per_medicare_tax', 'ytd_medicare_tax', 'per_gross_pay', 'ytd_gross_pay', 'per_net_pay', 'ytd_net_pay']
# Options describing how the golden files are read; 'identifier' names the
# mapped field ('file') used to identify a row.
OMF_GOLDEN_CONFIG = {
    'file_type': 'csv',
    'skip_first_row': True,
    'mapping': OMF_MAPPING,
    'identifier': 'file'
}
# Options describing the documents; the identifier callable reduces a path to
# its base name up to the first '.ibdoc'.
OMF_DATASET_CONFIG = {
    'file_type': 'ibdoc',
    'identifier': lambda path: os.path.basename(path).split('.ibdoc')[0]
}

# Build the dataset from the documents and goldens configured above.
omf_paystubs = PrototypeDataset(ACCESS_TOKEN, OMF_DATA, OMF_DATASET_CONFIG, OMF_GOLDEN, OMF_GOLDEN_CONFIG)
# Bare expression: as the last statement of a notebook cell it displays the goldens.
omf_paystubs.golden

## 2. Model and Model Selection Setup

In [None]:
import itertools
import numpy as np
import sklearn
import time
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Hyper-parameter grid to search. Each key maps to the list of candidate
# values; product_dict(**params) yields one dict per combination, which is
# passed to ModelContext(**parameters) in the search loop below.
# Commented-out entries are candidates that are currently disabled.
params = {
    'batch_size': [32],#, 64, 128],
    'epochs': [25, 50],
    'max_num_tokens': [5],
    'max_token_distance': [None],
    #'embedding_type': ['glove', 'bert'],
    'cardinal_only': [False],
    'balance_targets': [True],
    'additional_features': [[], ['is_number', 'is_company_indicator']],
    'pre_processing': [[], ['lower_case', 'remove_punc'], ['lower_case'], ['remove_punc']]
}

def create_sample_data(dataset, model_context):
    """Build a class-balanced train/test split for the 'employer_name' field.

    Args:
        dataset: Object exposing ``generate_spatial_samples(field, context)``
            that returns ``(samples, targets, warnings)``; ``samples`` and
            ``targets`` must support NumPy integer-array indexing.
        model_context: Featurization/training context forwarded to
            ``generate_spatial_samples``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` holding a 70/30 split of
        the balanced samples.
    """
    # Import the submodule explicitly: a bare ``import sklearn`` does not
    # guarantee that ``sklearn.model_selection`` has been loaded.
    from sklearn.model_selection import train_test_split

    # Use the context that was passed in rather than a notebook-level global,
    # so the function works outside the parameter-search loop as well.
    samples, targets, _warnings = dataset.generate_spatial_samples('employer_name', model_context)

    # Balance samples by removing some non-entity labeled datapoints: keep all
    # positives and an equally sized random subset of the negatives.
    pos_idx = np.where(targets == 1)[0]
    num_pos_samples = len(pos_idx)

    neg_idxs_all = np.where(targets == 0)[0]
    # NOTE: this shuffle uses NumPy's global RNG and is not seeded here, so the
    # chosen negatives differ between runs even though the split below is seeded.
    np.random.shuffle(neg_idxs_all)
    neg_idx = neg_idxs_all[:num_pos_samples]

    idx_to_use = np.concatenate((pos_idx, neg_idx))

    filtered_samples = samples[idx_to_use]
    filtered_targets = targets[idx_to_use]

    X_train, X_test, y_train, y_test = train_test_split(
        filtered_samples, filtered_targets, test_size=0.3, random_state=0)
    return (X_train, X_test, y_train, y_test)

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
import matplotlib.pyplot as plt

def train_model(X_train, X_test, y_train, y_test, model_context):
    """Fit a small feed-forward binary classifier on the given split.

    The network is 512 -> 128 -> 1 units (ReLU, ReLU, sigmoid) with 50%
    dropout after each hidden layer, trained with Adam on binary
    cross-entropy. The test split is used as validation data during fitting.

    Args:
        X_train, X_test: Feature matrices; the input width is taken from
            ``X_train.shape[1]``.
        y_train, y_test: Matching binary targets.
        model_context: Supplies ``epochs`` and ``batch_size`` for fitting.

    Returns:
        Tuple ``(model, history)`` as produced by Keras ``fit``.
    """
    feature_count = X_train.shape[1]
    layer_stack = [
        Dense(512, input_dim=feature_count, activation='relu'),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]

    network = Sequential()
    for layer in layer_stack:
        network.add(layer)
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    fit_history = network.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=model_context.epochs,
        batch_size=model_context.batch_size,
    )
    return network, fit_history

def _plot_training_curves(train_values, val_values, title, ylabel):
    """Plot one train/validation metric pair against the epoch index."""
    plt.plot(train_values)
    plt.plot(val_values)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()


def evaluate_model(model, history, model_context):
    """Plot accuracy and loss curves and return the final validation accuracy.

    Args:
        model: Trained model (unused; kept so existing callers keep working).
        history: Keras ``History`` object returned by ``fit``.
        model_context: Training context (unused; kept for the same reason).

    Returns:
        Validation accuracy recorded for the last epoch.

    Raises:
        KeyError: If the history holds neither 'acc' nor 'accuracy' metrics.
    """
    recorded = history.history
    # Older Keras releases report accuracy as 'acc'/'val_acc'; since 2.3 the
    # metric is recorded under the name given to compile(), here 'accuracy'.
    acc_key = 'acc' if 'acc' in recorded else 'accuracy'
    val_acc_key = 'val_' + acc_key

    _plot_training_curves(recorded[acc_key], recorded[val_acc_key], 'Model accuracy', 'Accuracy')
    _plot_training_curves(recorded['loss'], recorded['val_loss'], 'Model loss', 'Loss')
    return recorded[val_acc_key][-1]
    
    
# Grid search: train and evaluate one model per parameter combination.
print("Trying on {} combinations of parameters".format(len(list(product_dict(**params)))))
# One tuple per run: (acc, context, history, model, elapsed_seconds).
# Later cells index these tuples by position, so keep the layout stable.
results = []
for parameters in product_dict(**params):
    start = time.time()
    # NOTE: `context`, `model` and `history` are notebook-level names; after the
    # loop they refer to the LAST combination tried, not to the best one.
    context = ModelContext(**parameters)
    print('Trying on following context:\n\t{}'.format(parameters))
    # samples is the (X_train, X_test, y_train, y_test) tuple.
    samples = create_sample_data(omf_paystubs, context)
    print('Input shape: {}'.format(samples[0].shape))
    model, history = train_model(*samples, context)
    # acc is the value returned by evaluate_model (final validation accuracy).
    acc = evaluate_model(model, history, context)
    res = (acc, context, history, model, time.time() - start)
    # Print accuracy, context and elapsed time for this run.
    print((res[0], res[1], res[-1]))
    results.append(res)

# Summary: run index, accuracy and context of every run.
for i, result in enumerate(results):
    print((i, result[0], result[1]))
    

In [None]:
# Show the full result tuple of the run with the highest accuracy (element 0).
print(results[np.argmax([r[0] for r in results])])

In [None]:
# Extract candidate company names from every document with a trained model.
def _group_positive_tokens(predictions, tokens, threshold, distance_threshold):
    """Group above-threshold tokens into runs of horizontally adjacent tokens.

    A token joins the current run when the gap between its ``start_x`` and the
    previous token's ``end_x`` is at most ``distance_threshold`` times the
    token's ``line_height``; otherwise it starts a new run. A token at or
    below the threshold closes the current run.
    """
    sequences = [[]]
    for i, classification in enumerate(predictions):
        if classification[0] > threshold:
            token = tokens[i]
            current = sequences[-1]
            max_gap = distance_threshold * token['line_height']
            if current and (token['start_x'] - current[-1]['end_x']) <= max_gap:
                current.append(token)
            else:
                sequences.append([token])
        elif sequences[-1]:
            sequences.append([])
    return sequences


def evaluate(dataset, model, context, threshold=0.60, distance_threshold=1.5):
    """Predict company-name candidates for every document in ``dataset``.

    Args:
        dataset: Object whose ``dataset`` attribute maps a file identifier to a
            document exposing ``get_joined_page()``.
        model: Trained model exposing ``predict(feature_vectors)``; each
            prediction row carries the positive-class score at index 0.
        context: Featurization context passed to ``get_feature_vectors``.
        threshold: Minimum score for a token to count as part of a name.
        distance_threshold: Maximum horizontal gap between neighbouring tokens
            of one name, expressed in multiples of the token's line height.

    Returns:
        Dict mapping file identifier to the list of candidate strings. Only
        runs of at least two tokens are reported. Documents that fail to
        process are reported on stdout and left out of the result.
    """
    results = {}

    for dataset_file in dataset.dataset:
        try:
            ibdoc = dataset.dataset[dataset_file].get_joined_page()[0]
            featurizer = IBDOCFeaturizer(ibdoc)
            fvs = featurizer.get_feature_vectors(context)
            predictions = model.predict(fvs).tolist()
            # Fetch the token list once per document instead of once per
            # positive prediction.
            tokens = featurizer.get_all_tokens()
            sequences = _group_positive_tokens(predictions, tokens, threshold, distance_threshold)
            results[dataset_file] = [
                ' '.join(token['word'] for token in sequence)
                for sequence in sequences
                if len(sequence) > 1
            ]
        except Exception as e:
            # Best effort: keep going, but say which document failed.
            print('Failed to evaluate {}: {}'.format(dataset_file, e))
    return results

# Select the run with the highest accuracy; result tuples are laid out as
# (acc, context, history, model, elapsed_seconds).
best_model_idx = np.argmax([r[0] for r in results])
best_model = results[best_model_idx][3]
best_context = results[best_model_idx][1]
# Run the extraction over the dataset with that model and print the mapping
# of file identifier to the candidates found.
found_companies = evaluate(omf_paystubs, best_model, best_context)
print(found_companies)

In [None]:
# Compare the extracted candidates with the golden employer names and report
# recall over the documents that have a usable golden value.
total = 0
found_count = 0
for cfile in found_companies:
    try:
        expected = omf_paystubs.golden.at[cfile, 'employer_name']
    except Exception as e:
        # No golden row for this document: report it and move on.
        print(e)
        continue
    found = '\n\t\t'.join(found_companies[cfile])
    print(cfile[-10:])
    print('\t Found:\n\t\t{}'.format(found))
    print('\t Expected:\n\t\t{}'.format(expected))

    # Skip documents without a usable golden value (missing, non-string, or
    # empty once normalised). An empty expected string is a substring of
    # everything, so it would be counted as found without being counted in
    # `total`.
    if not isinstance(expected, str):
        continue
    expected_san = expected.lower().strip().translate(PUNC_TABLE)
    if not expected_san:
        continue
    total += 1

    # Normalise candidates the same way and drop empty ones for the same reason.
    actual_san = [c.lower().strip().translate(PUNC_TABLE) for c in found_companies[cfile]]
    actual_san = [a for a in actual_san if a]
    # A hit is containment in either direction (equality is a special case).
    if any((expected_san in a) or (a in expected_san) for a in actual_san):
        found_count += 1
    else:
        print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
print(total)
print(found_count)
if total:
    print('Recall: {}'.format(float(found_count)/float(total)))
else:
    # Avoid a ZeroDivisionError when nothing could be scored.
    print('Recall: n/a (no documents with a golden employer_name)')

In [None]:
# omf_paystubs.evaluate({k: {'employer_name': found_companies[k]} for k in found_companies}, {}, fields=['employer_name'])

In [None]:
# Inspect the predictions for a single document in detail.
# The original note on this cell listed 20, 54 and 70 as indices of interest.
DOC_INDEX = 54
ibdoc = omf_paystubs.dataset[list(omf_paystubs.dataset.keys())[DOC_INDEX]].get_joined_page()[0]
featurizer = IBDOCFeaturizer(ibdoc)
# Use the selected best model and its context. The bare `model`/`context`
# names are leftovers of the grid-search loop and refer to the last run,
# which is not necessarily the best one.
fvs = featurizer.get_feature_vectors(best_context)
print(ibdoc.get_text())
print('=================================')
predictions = best_model.predict(fvs)
predictions = predictions.tolist()
# Fetch the token list once instead of once per positive prediction.
tokens = featurizer.get_all_tokens()
sequences = [[]]
for i, classification in enumerate(predictions):
    # Stricter threshold than the default used by evaluate().
    if classification[0] > 0.99:
        token_to_add = tokens[i]
        to_add_start, to_add_height = token_to_add['start_x'], token_to_add['line_height']
        # Extend the current run only when the token sits close enough to the
        # previous one (gap of at most 1.5 line heights).
        if len(sequences[-1]) > 0 and (to_add_start - sequences[-1][-1]['end_x']) <= 1.5 * to_add_height:
            sequences[-1].append(token_to_add)
        else:
            sequences.append([token_to_add])
    elif len(sequences[-1]) > 0:
        # A low-scoring token closes the current run.
        sequences.append([])
# Only runs of at least two tokens are reported as candidates.
companies = [' '.join([ss['word'] for ss in s]) for s in sequences if len(s) > 1]
print(companies)