In [42]:
import os
import importlib
import logging
importlib.reload(logging)
import framework
importlib.reload(framework)
import refiner
importlib.reload(refiner)
import infer_bert_classifier
importlib.reload(infer_bert_classifier)
import bert_utils
importlib.reload(bert_utils)
import pandas as pd
import webbrowser
from framework import DataCuration
from refiner import Refiner, TaskNER

# Define some constants and configurations
logging.getLogger().setLevel(logging.INFO)
ACCESS_TOKEN = 'WUpGevbWC9lsnTW8quNUtmWRdAEM89'

Set up the task details. This notebook handles NER (for labeling person and company names)

In [43]:
DATASET = 'w2' # supports w2 and resume
TASK_CONFIG = {
    'task': 'ner',
    'num_labels': 3,
    'labels_dict': {'person' : 0, 'org' : 1, 'none': 2}
}

task = TaskNER(TASK_CONFIG)

In [44]:
W2_DATA = [
   '/Users/ahsaasbajaj/Documents/Data/w2-instabase/flow/s2_map_records'
]
W2_GOLDEN = [
   '/Users/ahsaasbajaj/Documents/Data/w2-instabase/golden/goldens.csv'
]

GOLDEN_CONFIG = {
    'path': W2_GOLDEN,
    'is_local': True,
    'index_field_name':'filename',
    'file_type': 'csv',
    'identifier': 'file'
}
DATASET_CONFIG = {
    'path': W2_DATA,
    'is_local': True, 
    'file_type': 'ibocr',
    'identifier': lambda path: os.path.basename(path).split('.ibocr')[0],
    'convert2txt': True
}

data = DataCuration(ACCESS_TOKEN, DATASET_CONFIG, GOLDEN_CONFIG)

INFO:root:Loading dataset from /Users/ahsaasbajaj/Documents/Data/w2-instabase/flow/s2_map_records
INFO:root:142 files loaded
INFO:root:Loading goldens from /Users/ahsaasbajaj/Documents/Data/w2-instabase/golden/goldens.csv
INFO:root:Total files Goldens: (154, 25)
INFO:root:Total files found in the source with unique index: (142, 25)
INFO:root:Processing 142 IBOCR files to txt


Set paths for datasets and goldens (local or ib, both work).
Specify configurations

In [45]:
DATASET = 'w2' # supports w2 and resume
TASK_CONFIG = {
    'task': 'ner',
    'num_labels': 3,
    'labels_dict': {'person' : 0, 'org' : 1, 'none': 2}
}

task = TaskNER(TASK_CONFIG)

In [46]:
W2_REFINER_RESULT_PATH = '/Users/ahsaasbajaj/Documents/Code/ner-hf/sequence-classification/refiner_results/w2.ibocr'
RESUME_REFINER_RESULT_PATH = '/Users/ahsaasbajaj/Documents/Code/ner-hf/sequence-classification/refiner_results/resume.ibocr'

DATA_ARGS = {
    'task': task,
    'dataset': data,
    'candidates_fields': {
        'person':'employee_name',
        'org':'employer_name'
    }
}
TRAINING_ARGS = {
    'model_file_or_path' : W2_REFINER_RESULT_PATH,
    'num_labels': TASK_CONFIG['num_labels'],
}

MODELS_TO_EVAL = {
    'models': ['names_vontell', 'names_token_matcher'],
    'spacy_models': ['names_spacy', 'org_spacy'],

    'person_name_models': ['names_vontell', 'names_token_matcher', 'names_spacy'],
    'org_name_models': ['org_spacy'],
}

Load ouputs from refiner flow (after step 4 producing single out.ibocr) and get extractions and metrics (Recall, Precision, F1) by comparing with goldens

In [47]:
model = Refiner(DATA_ARGS, TRAINING_ARGS, MODELS_TO_EVAL)
results = model.analyze_results()

INFO:root:
Person Name Scores
INFO:root:For model names_vontell, recall: 0.7465, precision: 0.4180, F1: 0.5359 
INFO:root:For model names_token_matcher, recall: 0.6549, precision: 0.4602, F1: 0.5405 
INFO:root:For model names_spacy, recall: 0.0915, precision: 0.0034, F1: 0.0066 
INFO:root:
Org Name Scores
INFO:root:For model org_spacy, recall: 0.0775, precision: 0.0012, F1: 0.0023 


Print extractions

In [48]:
print(results.keys())
print(results['person'].keys())
# print(results['person']['names_vontell'].keys())

dict_keys(['person', 'org'])
dict_keys(['names_vontell', 'names_token_matcher', 'names_spacy'])


In [49]:
data.dataset.keys()

dict_keys(['last_year_w2_1494610589440.PDF', 'last_year_w2_1493919658342.PDF', 'last_year_w2_1494609473036.PDF', 'last_year_w2_1493919676693.PDF', 'last_year_w2_1494967766712.PDF', 'last_year_w2_1494967275596.PDF', 'last_year_w2_1495565064610.PDF', 'last_year_w2_1494271104951.PDF', 'last_year_w2_1494974543429.PDF', 'last_year_w2_1494968098673.PDF', 'last_year_w2_1494968081996.PDF', 'last_year_w2_1494968808776.pdf', 'last_year_w2_1495142754897.PDF', 'last_year_w2_1494265175121.PDF', 'last_year_w2_1494261908342.PDF', 'last_year_w2_1494968101626.PDF', 'last_year_w2_1494976044553.PDF', 'last_year_w2_1494271068960.pdf', 'last_year_w2_1494342956999.pdf', 'last_year_w2_1495120461121.PNG', 'last_year_w2_1494271121485.PDF', 'last_year_w2_1494976364261.PDF', 'last_year_w2_1494271204620.PDF', 'last_year_w2_1495120702277.PDF', 'last_year_w2_1495059592755.JPEG', 'last_year_w2_1495059646952.PNG', 'last_year_w2_1495563052751.PDF', 'last_year_w2_1494968839220.PDF', 'last_year_w2_1494262066281.PDF', 'l

In [50]:
DEMO_FILE = 'last_year_w2_1494265175121.PDF'
model.demo(results, DEMO_FILE)

INFO:root:Field type: person
INFO:root:model type: names_vontell
INFO:root:{'MUDD-MOSHER X', 'SUE MUDD-MOSHER', 'PAMELA SUE'}
INFO:root:

INFO:root:model type: names_token_matcher
INFO:root:{'PAMELA SUE'}
INFO:root:

INFO:root:model type: names_spacy
INFO:root:{'Nonqualified', '736', 'HOSPITAL', 'VIRGINIA', "Employer's", '120', 'Return OMB', 'Third-party', 'Wage', '20', '2a', 'VA', 'Medicare', 'Allocated', "Employee's", 'Filed', 'PAMELA', 'SeService', '18', '6', 'GINIA', 'C', '2412.74', 'NORTH'}
INFO:root:

INFO:root:

INFO:root:Field type: org
INFO:root:model type: org_spacy
INFO:root:{'Federal', 'CHESAPEAKE VA', '16', 'HOSPITAL', 'Tax Statement', '(', 'RECORDS', '2', 'EIN', 'Locality', 'IRS', 'FEDERAL Tax', 'CHESAPEAKE', 'State', '13', 'SPRING', 'Form W-2', 'Social', 'SUE MUDD', 'Medicare', 'Copy B.', 'Treasury-', 'Allocated', "Employee's", 'the Internal Revenue Service', 'VA 23320', 'Form', 'BEACH VA', 'SickParty', 'Employee'}
INFO:root:

INFO:root:



dict_keys(['person', 'org'])

dict_keys(['names_vontell', 'names_token_matcher', 'names_spacy'])

In [63]:
DIR_PATH = '/Users/ahsaasbajaj/Documents/Data/w2-instabase/pdf'
FILE_PATH = DIR_PATH + '/' + DEMO_FILE 


In [64]:

webbrowser.open_new(r'file:' + FILE_PATH)

True