In [1]:
import os
import importlib
import logging
importlib.reload(logging)
import framework
importlib.reload(framework)
import bert_ner
importlib.reload(bert_ner)
import infer_bert_classifier
importlib.reload(infer_bert_classifier)
import bert_utils
importlib.reload(bert_utils)
import pandas as pd
from framework import DataCuration, FeatureEngineering
from bert_ner import TaskNER, FeatureEngineeringNER, BERTNER

# Define some constants and configurations
logging.getLogger().setLevel(logging.INFO)

# Read the API access token from the environment instead of committing it to the
# notebook. The token that used to be hardcoded on this line has been exposed to
# anyone with access to the notebook and should be revoked and rotated.
# NOTE(review): the dataset and golden configs below both set 'is_local': True, so
# the token is presumably unused for local runs -- confirm against DataCuration
# before relying on the empty-string fallback.
ACCESS_TOKEN = os.environ.get('IB_ACCESS_TOKEN', '')

Using TensorFlow backend.


Set up the task details. This notebook handles NER (labeling person and company names).

In [2]:
# Dataset selector; 'w2' and 'resume' are the supported values.
DATASET = 'w2'

# NER task definition: the task name, the number of labels, and the mapping
# from each label name to its integer id.
TASK_CONFIG = dict(
    task='ner',
    num_labels=3,
    labels_dict=dict(person=0, org=1, none=2),
)

task = TaskNER(TASK_CONFIG)

Set the paths for the datasets and goldens (local and ib paths both work),
then specify the dataset and golden configurations.

In [3]:
# Root folder of the W2 dataset. Set the W2_DATA_ROOT environment variable to run
# the notebook on another machine; the default keeps the original location.
W2_DATA_ROOT = os.environ.get(
    'W2_DATA_ROOT', '/Users/ahsaasbajaj/Documents/Data/w2-instabase'
)

# Both entries are lists because the configs below take them under 'path'.
W2_DATA = [
    os.path.join(W2_DATA_ROOT, 'flow', 's2_map_records')
]
W2_GOLDEN = [
    os.path.join(W2_DATA_ROOT, 'golden', 'goldens.csv')
]

# Where the goldens live and how they are read.
GOLDEN_CONFIG = {
    'path': W2_GOLDEN,
    'is_local': True,
    'index_field_name': 'filename',
    'file_type': 'csv',
    'identifier': 'file'
}

# Where the dataset lives and how its files are read.
DATASET_CONFIG = {
    'path': W2_DATA,
    'is_local': True,
    'file_type': 'ibocr',
    # Identifier = base file name with everything from the first '.ibocr' onwards removed.
    'identifier': lambda path: os.path.basename(path).split('.ibocr')[0],
    'convert2txt': True
}

data = DataCuration(ACCESS_TOKEN, DATASET_CONFIG, GOLDEN_CONFIG)

INFO:root:Loading dataset from /Users/ahsaasbajaj/Documents/Data/w2-instabase/flow/s2_map_records
INFO:root:142 files loaded
INFO:root:Loading goldens from /Users/ahsaasbajaj/Documents/Data/w2-instabase/golden/goldens.csv
INFO:root:Total files Goldens: (154, 25)
INFO:root:Total files found in the source with unique index: (142, 25)
INFO:root:Processing 142 IBOCR files to txt


In [4]:
# Settings handed to candidate-phrase generation.
PROCESSING_CONFIG = dict(X_DIST_THRESHOLD=200)

# Shared arguments: the task, the curated dataset, and, per label, the field it is
# compared against (presumably golden column names -- TODO confirm).
DATA_ARGS = dict(
    task=task,
    dataset=data,
    candidates_fields=dict(person='employee_name', org='employer_name'),
)

data.generate_candidates_phrases(PROCESSING_CONFIG)

candidate_fields = DATA_ARGS['candidates_fields']
data.compare_candidates_and_goldens(candidate_fields)

INFO:root:Generating candidates for 142 files
INFO:root:For X_DIST_THRESHOLD configuraion: 200
INFO:root:total files: 142
person names found in candidates: 130
org names found in candidates: 69



Generate test data either from the goldens (the actual person and company names) or from the ibocr files (using the candidate phrases generated by processIBOCR2candidatePhrases()).

In [5]:
# Build the NER feature-engineering helper from the shared task/dataset arguments.
fe = FeatureEngineeringNER(DATA_ARGS)
# Test samples built from the goldens.
test_data_from_goldens = fe.generate_test_samples_from_goldens() # single dataframe
# Test samples built from the candidate phrases; this is the input the prediction cell uses.
test_data_from_candidates = fe.generate_test_samples_from_candidates() # dict{'filename' : dataframe}

Load the fine-tuned model for inference. These models were trained separately on GPUs.

In [6]:
# Root folder of the fine-tuned models. Set the NER_MODEL_ROOT environment variable
# to run the notebook on another machine; the default keeps the original location.
MODEL_ROOT = os.environ.get(
    'NER_MODEL_ROOT',
    '/Users/ahsaasbajaj/Documents/Code/ner-hf/sequence-classification',
)

MODEL_PATHS = {
    'w2': os.path.join(MODEL_ROOT, 'w2', 'no-address', '5', 'model.pt'),  # trained on public w2 from Kaggle
    'public': os.path.join(MODEL_ROOT, 'public', 'no-address', '200', 'model.pt'),  # trained on public names repo
}

TRAINING_ARGS = {
    'model_file_or_path': MODEL_PATHS['w2'],
    # Reuse the task's label count so the model and the task definition stay in sync.
    'num_labels': TASK_CONFIG['num_labels'],
    'gpu': False,
}

model = BERTNER(DATA_ARGS, TRAINING_ARGS)

Set up the model evaluator and evaluate using either the test data generated from goldens (test_data_from_goldens) or all candidate strings (test_data_from_candidates). The code below runs BERT inference and performs extraction, then calculates recall, precision, and F1 by comparing against the goldens.

In [7]:
# Predictions
# To evaluate on the goldens instead (single dataframe), uncomment:
# output_golden = model.predict(test_data_from_goldens)
# print('Sample outputs: ', output_golden.head())
# model.analyze_golden_result(output_golden)

# Debug-only shortcut: run inference on a small subset to get quick results.
# The subset is stored under its own name so the full candidate set in
# test_data_from_candidates is not overwritten and re-running this cell always
# starts from the same input.
DEBUG_SAMPLE_SIZE = 2
test_data_debug_subset = FeatureEngineering.get_subset_for_debugging(
    test_data_from_candidates, sample_size=DEBUG_SAMPLE_SIZE
)

output = model.predict(test_data_debug_subset)  # output is a dictionary
print('Number of files: ', len(output))
results = model.analyze_result(output)

INFO:root:inferring BERT classifier for file last_year_w2_1494968117938.PDF
INFO:root:inferring BERT classifier for file last_year_w2_1494968749784.PDF
INFO:root:For field person, recall: 1.0000, precision: 0.6667, F1: 0.8000 
INFO:root:For field org, recall: 0.5000, precision: 0.2500, F1: 0.3333 
Number of files:  2


In [8]:
# Show the extraction results; as the cell output shows, this logs the set of
# extracted strings for each field type and file.
model.demo(results)

INFO:root:Field type: person
INFO:root:filename: last_year_w2_1494968117938.PDF
INFO:root:{'DARICK J ENDECOTT'}
INFO:root:filename: last_year_w2_1494968749784.PDF
INFO:root:{'HEWLETT PACKARD ENTERPRISE', 'PATRICK J WAGNER', 'Local:'}
INFO:root:Field type: org
INFO:root:filename: last_year_w2_1494968117938.PDF
INFO:root:{'SUI/SDI', 'FUSION LED INC'}
INFO:root:filename: last_year_w2_1494968749784.PDF
INFO:root:{'COMPANY.', 'COMPANY'}
