In [1]:
import os
import importlib
import logging
importlib.reload(logging)
import framework
importlib.reload(framework)
import infer_bert_classifier
importlib.reload(infer_bert_classifier)
import bert_utils
importlib.reload(bert_utils)
from framework import *
import pandas as pd
# Define some constants and configurations
logging.getLogger().setLevel(logging.INFO)

# Credentials must not be committed with the notebook: the access token is read
# from the IB_ACCESS_TOKEN environment variable instead of being hard-coded.
# An empty string is used when the variable is missing so the notebook still runs.
# NOTE(review): the dataset/golden configs in this notebook set 'is_local': True,
# so the token is presumably unused for local runs; confirm against DataCuration.
ACCESS_TOKEN = os.environ.get('IB_ACCESS_TOKEN', '')
if not ACCESS_TOKEN:
    logging.warning('IB_ACCESS_TOKEN is not set; continuing with an empty access token')

Using TensorFlow backend.


Set up the task details. This notebook handles NER (for labeling person and company names)

In [2]:
DATASET = 'w2'  # supports w2 and resume

# Label name -> integer id, as passed to the task under 'labels_dict'.
LABELS_DICT = {'person': 0, 'org': 1, 'none': 2}

TASK_CONFIG = {
    'task': 'ner',
    # Derived from LABELS_DICT so the count cannot drift when labels are added or removed.
    'num_labels': len(LABELS_DICT),
    'labels_dict': LABELS_DICT
}

# Task object built from the configuration above (Task_NER comes from `framework`).
task = Task_NER(TASK_CONFIG)

Set paths for datasets and goldens (local or ib, both work).
Specify configurations

In [3]:
# Root folder that holds both the dataset and its goldens. Set the NER_DATA_ROOT
# environment variable to point somewhere else; the default keeps the original location.
DATA_ROOT = os.environ.get(
    'NER_DATA_ROOT', '/Users/ahsaasbajaj/Documents/Data/w2-instabase'
).rstrip('/')


def ibocr_identifier(path):
    """Return the base name of `path` truncated at the first '.ibocr'."""
    return os.path.basename(path).split('.ibocr')[0]


# Goldens: a CSV file whose index field is 'filename'.
GOLDEN_CONFIG = {
    'path': DATA_ROOT + '/golden/goldens.csv',
    'is_local': True,
    'index_field_name': 'filename',
    'file_type': 'csv',
    'identifier': 'file'
}

# Dataset: ibocr files, identified by their base name without the '.ibocr' suffix.
DATASET_CONFIG = {
    'path': DATA_ROOT + '/flow/s2_map_records',
    'is_local': True,
    'file_type': 'ibocr',
    'identifier': ibocr_identifier,
    'convert2txt': True
}

data = DataCuration(ACCESS_TOKEN, DATASET_CONFIG, GOLDEN_CONFIG)

INFO:root:Loading dataset from /Users/ahsaasbajaj/Documents/Data/w2-instabase/flow/s2_map_records
INFO:root:142 files loaded
INFO:root:Loading goldens from /Users/ahsaasbajaj/Documents/Data/w2-instabase/golden/goldens.csv
INFO:root:Total files Goldens: (154, 25)
INFO:root:Total files found in the source: (142, 25)
INFO:root:Processing 142 IBOCR files to txt


In [4]:
# Settings handed to candidate-phrase generation
PROCESSING_CONFIG = dict(X_DIST_THRESHOLD=200)

# Shared arguments: the task, the curated dataset, and the field names used
# for each label when candidates are compared with goldens
# (presumably golden column names; verify against DataCuration)
DATA_ARGS = dict(
    task=task,
    dataset=data,
    candidates_fields=dict(
        person='employee_name',
        org='employer_name',
    ),
)

# Build candidate phrases, then report how many golden names they contain
data.generate_candidates_phrases(PROCESSING_CONFIG)
data.compare_candidates_and_goldens(DATA_ARGS['candidates_fields'])

INFO:root:Generating candidates for 142 files
INFO:root:For X_DIST_THRESHOLD configuraion: 200
INFO:root:total files: 142
person names found in candidates: 130
org names found in candidates: 69



Generate test data either from goldens (the actual person and company names) or from ibocr (using the candidate phrases generated by processIBOCR2candidatePhrases()).

In [5]:
# Feature engineering helper built from the shared data arguments
fe = FeatureEngineering_NER(DATA_ARGS)

# Test samples from goldens: single dataframe
test_data_from_goldens = fe.generate_test_samples_from_goldens()

# Test samples from candidates: dict{'filename' : dataframe}
test_data_from_candidates = fe.generate_test_samples_from_candidates()

Loading fine-tuned model for inference. These models were separately trained using GPUs

In [6]:
# Fine-tuned model checkpoints available for inference
MODEL_DICT = {
    'w2': '/Users/ahsaasbajaj/Documents/Code/ner-hf/sequence-classification/w2/no-address/5/model.pt',  # trained on public w2 from Kaggle
    'public': '/Users/ahsaasbajaj/Documents/Code/ner-hf/sequence-classification/public/no-address/200/model.pt'  # trained on public names repo
}

# Single switch for the evaluation source: True evaluates on samples generated
# from goldens, False on samples generated from candidates. Both
# TRAINING_ARGS['use_goldens'] and the test_data choice below follow it, so the
# two can no longer get out of sync.
USE_GOLDENS = False

# Number of samples kept for a quick debugging run; set to None to skip the
# subsetting and evaluate on all of test_data.
DEBUG_SAMPLE_SIZE = 2

TRAINING_ARGS = {
    'task': task,
    'model_file_or_path': MODEL_DICT['w2'],
    'num_labels': TASK_CONFIG['num_labels'],
    'gpu': False,
    'use_goldens': USE_GOLDENS
}

test_data = test_data_from_goldens if USE_GOLDENS else test_data_from_candidates

# Do only for debugging and getting quick results
if DEBUG_SAMPLE_SIZE is not None:
    test_data = FeatureEngineering.get_subset_for_debugging(test_data, sample_size=DEBUG_SAMPLE_SIZE)

Set up the model evaluator and evaluate using either the test data generated from goldens (test_data_from_goldens) or all candidate strings (test_data_from_candidates). The code below runs BERT inference and performs extraction, and also calculates Recall, Precision and F1 by comparing with goldens.

In [12]:

model_evaluator = ModelEvaluator(TRAINING_ARGS)

# Run the evaluator over the selected test data
output = model_evaluator.run_evaluation(test_data)

# Analyze the predictions; the handling depends on how the test data was generated
if not TRAINING_ARGS['use_goldens']:
    # candidates: output is a dictionary
    print('Number of files: ', len(output.keys()))
    results = model_evaluator.analyze_overall_result(output, data.golden, DATA_ARGS['candidates_fields'])
else:
    # goldens: output is a single df
    print('Sample outputs: ', output.head())
    model_evaluator.analyze_golden_result(output)


INFO:root:inferring BERT classifier for file last_year_w2_1494968326840.PDF
INFO:root:inferring BERT classifier for file last_year_w2_1493919686919.PDF
INFO:root:For field person, recall: 1.0000, precision: 0.5000, F1: 0.6667 
INFO:root:For field org, recall: 0.0000, precision: 0.5000, F1: 0.0000 
Number of files:  2


Print extractions

In [8]:
# Disabled snippet: the loop below is wrapped in a string literal, so running this
# cell executes nothing and only echoes the string.
# NOTE(review): the inner `for key in results[typ]:` line and its body are indented
# one level too deep (under a print call); if the quotes are removed this raises
# IndentationError. Dedent those three lines by four spaces before enabling.
# `results` is only assigned by the evaluation cell when 'use_goldens' is False.
'''
# Print results
for typ in results:
    print('Field type: ', typ)
        for key in results[typ]:
            print('filename: ', key)
            print(results[typ][key])
'''

"\n# Print results\nfor typ in results:\n    print('Field type: ', typ)\n        for key in results[typ]:\n            print('filename: ', key)\n            print(results[typ][key])\n"

Load outputs from the refiner flow (after step 4, which produces a single out.ibocr) and get extractions and metrics (Recall, Precision, F1) by comparing with goldens.

In [17]:
# Refiner flow outputs, one ibocr file per dataset
RESUME_REFINER_RESULT_PATH = '/Users/ahsaasbajaj/Documents/Code/ner-hf/sequence-classification/refiner_results/resume.ibocr'
W2_REFINER_RESULT_PATH = '/Users/ahsaasbajaj/Documents/Code/ner-hf/sequence-classification/refiner_results/w2.ibocr'

# Output that gets analysed; currently the w2 one
REFINER_RESULT_PATH = W2_REFINER_RESULT_PATH

In [18]:
# Evaluator instance built from TRAINING_ARGS
model_evaluator = ModelEvaluator(TRAINING_ARGS)

# Analyse the refiner output against the goldens; `results` is (re)bound here
results = model_evaluator.analyze_refiner_results(
    REFINER_RESULT_PATH,
    data.golden,
    DATA_ARGS['candidates_fields'],
)

INFO:root:
Person Name Scores
INFO:root:For model names_vontell, recall: 0.7465, precision: 0.4180, F1: 0.5359 
INFO:root:For model names_token_matcher, recall: 0.6549, precision: 0.4602, F1: 0.5405 
INFO:root:For model names_spacy, recall: 0.0915, precision: 0.0034, F1: 0.0066 
INFO:root:
Org Name Scores
INFO:root:For model org_spacy, recall: 0.0775, precision: 0.0012, F1: 0.0023 


Print extractions

In [11]:
# Disabled snippet: the loop below is wrapped in a string literal, so running this
# cell executes nothing and only echoes the string.
# If enabled, it walks `results` as results[typ][model][key] and prints every entry
# (presumably the structure returned by analyze_refiner_results; verify before use).
''' 
# Print results
for typ in results:
    print('Field type: ', typ)
    for model in results[typ]:
        print('model type: ', model)
        for key in results[typ][model]:
            print('filename: ', key)
            print(results[typ][model][key])
'''            

" \n# Print results\nfor typ in results:\n    print('Field type: ', typ)\n    for model in results[typ]:\n        print('model type: ', model)\n        for key in results[typ][model]:\n            print('filename: ', key)\n            print(results[typ][model][key])\n"