In [40]:
import os
import importlib
import logging
importlib.reload(logging)
import framework
importlib.reload(framework)
import infer_bert_classifier
importlib.reload(infer_bert_classifier)
import bert_utils
importlib.reload(bert_utils)
from framework import Usecase, DataCuration, FeatureEngineering, ModelEvaluator
import pandas as pd
import random
# Define some constants and configurations
# Let INFO-level records from the root logger through so they show up in cell output.
logging.getLogger().setLevel(logging.INFO)

# Read the access token from the environment instead of committing it to the
# notebook; falls back to an empty string when IB_ACCESS_TOKEN is not set.
# NOTE(review): the token that was previously hard-coded here has been exposed
# in the notebook history and should be revoked.
ACCESS_TOKEN = os.environ.get('IB_ACCESS_TOKEN', '')

Set up the usecase details

In [41]:
# Use-case identifier plus its settings: dataset key, number of classes, and
# the label-name -> class-index mapping.
USECASE = 'ner'
USECASE_CONFIG = dict(
    dataset='w2',
    num_labels=3,
    labels_dict=dict(person=0, org=1, none=2),
)

# Create the use-case object and attach the configuration to it.
usecase = Usecase(USECASE)
usecase.set_config(USECASE_CONFIG)

Set paths for datasets and goldens (local or ib, both work).
Specify configurations

In [42]:
# Root folder of the W2 dataset. It can be overridden with the W2_DATA_ROOT
# environment variable so the notebook is not tied to one machine; the default
# is the original location.
W2_DATA_ROOT = os.environ.get(
    'W2_DATA_ROOT', '/Users/ahsaasbajaj/Documents/Data/w2-instabase'
)

# Input records and golden file, each keyed by (usecase, dataset).
DATAS = {
    ('ner', 'w2'): os.path.join(W2_DATA_ROOT, 'flow', 's2_map_records')
}

GOLDENS = {
    ('ner', 'w2'): os.path.join(W2_DATA_ROOT, 'golden', 'goldens.csv')
}

# Resolve the paths for the dataset selected in USECASE_CONFIG.
DATASET = USECASE_CONFIG['dataset']
DATA = DATAS[(USECASE, DATASET)]
GOLDEN = GOLDENS[(USECASE, DATASET)]

# Settings passed to DataCuration for reading the golden file.
GOLDEN_CONFIG = {
    'is_local': True,
    'index_field_name': 'filename',
    'file_type': 'csv',
    'skip_first_row': True,
    'identifier': 'file'
}
# Settings passed to DataCuration for reading the dataset files.
DATASET_CONFIG = {
    'is_local': True,
    'file_type': 'ibocr',
    # Identifier of a file: its base name with everything from the first
    # '.ibocr' onwards removed.
    'identifier': lambda path: os.path.basename(path).split('.ibocr')[0],
    'convert2txt': False
}

w2 = DataCuration(ACCESS_TOKEN, DATA, DATASET_CONFIG, GOLDEN, GOLDEN_CONFIG)


INFO:root:Loading dataset from /Users/ahsaasbajaj/Documents/Data/w2-instabase/flow/s2_map_records
INFO:root:Loading goldens from /Users/ahsaasbajaj/Documents/Data/w2-instabase/golden/goldens.csv
INFO:root:Total files Goldens: (154, 25)
INFO:root:Total files found in the source: (142, 25)


In [43]:
# Settings for candidate-phrase generation.
PROCESSING_CONFIG = dict(X_DIST_THRESHOLD=200)

# Label name -> field name (presumably the golden column holding the expected
# value for that label; confirm against the framework).
CANDIDATES_FIELDS = dict(person='employee_name', org='employer_name')

# Generate candidate phrases from the IBOCR files, then compare them with the goldens.
w2.process_IBOCR_to_candidate_phrases(DATASET_CONFIG, PROCESSING_CONFIG)
w2.compare_candidates_and_goldens(PROCESSING_CONFIG, CANDIDATES_FIELDS)

INFO:root:Generating candidates for 142 files
INFO:root:For X_DIST_THRESHOLD configuraion: 200
INFO:root:total files: 142
person names found in candidates: 130
org names found in candidates: 69



Generate test data either from goldens (the actual person and company names) or from the IBOCR files (using the candidate phrases generated by process_IBOCR_to_candidate_phrases()).

In [44]:
# Build the feature-engineering helper from the use case, the curated dataset
# object, and the label -> field mapping.
fe = FeatureEngineering(usecase, w2, CANDIDATES_FIELDS)
# Two alternative test sets; a later cell selects which one is evaluated.
test_data_from_goldens = fe.generate_test_samples_from_goldens() # single dataframe
test_data_from_candidates = fe.generate_test_samples_from_candidates() # dict{'filename' : dataframe}

Loading fine-tuned model for inference. These models were separately trained using GPUs

In [45]:
MODEL_TRAIN_DATA = 'w2'     # options for NER: (w2 or public)

# Root folder of the fine-tuned models. It can be overridden with the
# NER_MODEL_DIR environment variable; the default is the original location.
MODEL_DIR = os.environ.get(
    'NER_MODEL_DIR',
    '/Users/ahsaasbajaj/Documents/Code/ner-hf/sequence-classification'
)

# Model checkpoints keyed by (usecase, training data).
MODELS = {
    ('ner', 'w2'): os.path.join(MODEL_DIR, 'w2', 'no-address', '5', 'model.pt'),
    ('ner', 'public'): os.path.join(MODEL_DIR, 'public', 'no-address', '200', 'model.pt')
}

MODEL_CONFIG = {
    'model_file_or_path': MODELS[(USECASE, MODEL_TRAIN_DATA)],
    'num_labels': USECASE_CONFIG['num_labels'],
}

EVAL_CONFIG = {
    'gpu': False,
    'use_goldens': False # True if test_data generated using generate_test_samples_from_goldens(), else False
}
test_data = test_data_from_candidates

# ToDo
# testing -- include as feature alongwith support with sampling from df
# Keep one randomly chosen file for a quick test run.
# - random.sample() needs a sequence: passing the dict view directly raises
#   TypeError on Python 3.11+, so the items are materialised into a list first.
# - A dedicated, seeded generator makes the chosen file reproducible across
#   re-runs without touching the global random state.
SAMPLE_SEED = 0
test_data = dict(random.Random(SAMPLE_SEED).sample(list(test_data.items()), 1))


Set up the model evaluator and evaluate using either test data generated from goldens (test_data_from_goldens) or all candidate strings (test_data_from_candidates). The code below runs BERT inference and performs extraction, and also calculates recall, precision, and F1 by comparing with the goldens.

In [46]:

# Build the evaluator for this use case and hand it the model/evaluation settings.
model_evaluator = ModelEvaluator(usecase)
model_evaluator.set_config(MODEL_CONFIG, EVAL_CONFIG)

# Run inference over the selected test data.
output = model_evaluator.run_evaluation(test_data)

# Report on the predictions; the form of `output` depends on the test-data source.
if not EVAL_CONFIG['use_goldens']:
    # Candidate-based run: `output` is a dictionary.
    print('Number of files: ', len(output.keys()))
    results = model_evaluator.analyze_overall_result(output, w2.golden, CANDIDATES_FIELDS)
else:
    # Golden-based run: `output` is a single dataframe.
    print('Sample outputs: ', output.head())
    model_evaluator.analyze_golden_result(output)


INFO:root:inferring BERT classifier for file last_year_w2_1494968553744.PDF
INFO:root:For field person, recall: 1.0000, precision: 0.2500, F1: 0.4000 
INFO:root:For field org, recall: 1.0000, precision: 0.5000, F1: 0.6667 
Number of files:  1


Print extractions

In [47]:
# Disabled snippet: the loop below sits inside a bare triple-quoted string, so
# none of it executes and the cell only echoes the string as its output.
# It is meant to walk `results` by field type and then by filename.
# NOTE(review): the inner `for key ...` line is indented one level deeper than
# the `print` above it; re-enabling the snippet as written would raise an
# IndentationError. Dedent the inner loop by four spaces first.
'''
# Print results
for typ in results:
    print('Field type: ', typ)
        for key in results[typ]:
            print('filename: ', key)
            print(results[typ][key])
'''

"\n# Print results\nfor typ in results:\n    print('Field type: ', typ)\n        for key in results[typ]:\n            print('filename: ', key)\n            print(results[typ][key])\n"

Load outputs from the refiner flow (after step 4, which produces a single out.ibocr) and get extractions and metrics (Recall, Precision, F1) by comparing with goldens.

In [48]:
# Folder holding the refiner-flow outputs. It can be overridden with the
# NER_REFINER_RESULTS_DIR environment variable; the default is the original
# location.
REFINER_RESULTS_DIR = os.environ.get(
    'NER_REFINER_RESULTS_DIR',
    '/Users/ahsaasbajaj/Documents/Code/ner-hf/sequence-classification/refiner_results'
)

# Refiner outputs keyed by (usecase, dataset).
REFINER_RESULTS = {
    ('ner', 'w2'): os.path.join(REFINER_RESULTS_DIR, 'w2.ibocr'),
    ('ner', 'resume'): os.path.join(REFINER_RESULTS_DIR, 'resume.ibocr')
}

# Index by the dataset being evaluated, not by the model's training data:
# the table has no 'public' entry, so MODEL_TRAIN_DATA = 'public' would raise
# KeyError here, and the results are scored against this dataset's goldens.
REFINER_RESULT_PATH = REFINER_RESULTS[(USECASE, DATASET)]

In [49]:
# Create a fresh evaluator for the use case; this rebinds `model_evaluator`.
model_evaluator = ModelEvaluator(usecase)
# Analyze the refiner output file against the goldens for the configured fields.
# Rebinds `results`, replacing any value assigned by an earlier cell.
results = model_evaluator.analyze_refiner_results(REFINER_RESULT_PATH, w2.golden, CANDIDATES_FIELDS)

INFO:root:
Person Name Scores
INFO:root:For model names_vontell, recall: 0.7465, precision: 0.4180, F1: 0.5359 
INFO:root:For model names_token_matcher, recall: 0.6549, precision: 0.4602, F1: 0.5405 
INFO:root:For model names_spacy, recall: 0.0915, precision: 0.0034, F1: 0.0066 
INFO:root:
Org Name Scores
INFO:root:For model org_spacy, recall: 0.0775, precision: 0.0012, F1: 0.0023 


Print extractions

In [50]:
# Disabled snippet: the loop below sits inside a bare triple-quoted string, so
# none of it executes and the cell only echoes the string as its output.
# It is meant to walk `results` by field type, then by model, then by filename.
''' 
# Print results
for typ in results:
    print('Field type: ', typ)
    for model in results[typ]:
        print('model type: ', model)
        for key in results[typ][model]:
            print('filename: ', key)
            print(results[typ][model][key])
'''            

" \n# Print results\nfor typ in results:\n    print('Field type: ', typ)\n    for model in results[typ]:\n        print('model type: ', model)\n        for key in results[typ][model]:\n            print('filename: ', key)\n            print(results[typ][model][key])\n"