In [313]:
# Standard-library helpers.
import os
import importlib
import logging
# Re-execute the logging module so this cell starts from a freshly loaded copy
# (presumably to drop logging state left over from earlier runs -- TODO confirm).
importlib.reload(logging)
# Project-local modules: each one is imported and then immediately reloaded, so
# re-running this cell re-executes their source (presumably so that local edits
# are picked up without restarting the kernel -- TODO confirm).
import framework
importlib.reload(framework)
import bert_ner
importlib.reload(bert_ner)
import infer_bert_classifier
importlib.reload(infer_bert_classifier)
import bert_utils
importlib.reload(bert_utils)
import pandas as pd
# These from-imports run after the reloads above, so the names are bound to the
# reloaded module objects.
from framework import DataCuration, FeatureEngineering
from bert_ner import TaskNER, FeatureEngineeringNER, BERTNER

# Define some constants and configurations
# Raise the root logger to INFO so INFO-level progress messages are shown.
logging.getLogger().setLevel(logging.INFO)

# Credentials must not be committed with the notebook: the access token is read
# from the IB_ACCESS_TOKEN environment variable instead of being hardcoded.
# It falls back to an empty string so the notebook still runs when no token is
# exported (assumes purely local datasets do not need one -- TODO confirm).
# NOTE(review): the token that used to be hardcoded here should be treated as
# leaked and revoked.
ACCESS_TOKEN = os.environ.get('IB_ACCESS_TOKEN', '')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ahsaasbajaj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Set up the task details. This notebook handles NER (named-entity recognition), used here to label person and company names.

In [314]:
# Dataset this run targets; 'w2' and 'resume' are the supported values.
DATASET = "w2"

# NER task definition. 'address' is also supported as a 4th category.
TASK_CONFIG = dict(
    task="ner",
    num_labels=3,
    # class name -> integer label id, numbered in listing order
    labels_dict={name: idx for idx, name in enumerate(("person", "org", "none"))},
)

# Build the NER task object from TASK_CONFIG.
task = TaskNER(TASK_CONFIG)

Set the paths for the datasets and goldens (both local and ib paths work), then specify the loading configurations.

In [315]:
# Root folder of the publicly available W2 forms (Kaggle).
# Set the W2_DATA_ROOT environment variable to run the notebook on another
# machine; the default keeps the original local path, so existing setups are
# unaffected.
W2_ROOT = os.environ.get(
    'W2_DATA_ROOT',
    '/Users/ahsaasbajaj/Documents/Data/W2/W2_Clean_DataSet_01_20Sep2019'
).rstrip('/')

# OCR output folder(s) and golden CSV file(s), both derived from the single root.
W2_DATA = [W2_ROOT + '/ocr/out-pdf']
W2_GOLDEN = [W2_ROOT + '/golden/goldens.csv']

# How the goldens are loaded: a CSV whose index column is 'File_BaseName'.
GOLDEN_CONFIG = {
    'path': W2_GOLDEN,
    'is_local': True,
    'index_field_name': 'File_BaseName',
    'file_type': 'csv',
    'identifier': 'file'
}

# How the dataset is loaded: ibocr files.
DATASET_CONFIG = {
    'path': W2_DATA,
    'is_local': True,
    'file_type': 'ibocr',
    # Document id = file base name with everything from the first '.ibocr' and
    # then the first '.pdf' onwards cut off, e.g. 'form_12.pdf.ibocr' -> 'form_12'.
    'identifier': lambda path: os.path.basename(path).split('.ibocr')[0].split('.pdf')[0],
    'convert2txt': True
}

# Load the dataset and the goldens described by the two configs above.
data = DataCuration(ACCESS_TOKEN, DATASET_CONFIG, GOLDEN_CONFIG)

INFO:root:Loading dataset from /Users/ahsaasbajaj/Documents/Data/W2/W2_Clean_DataSet_01_20Sep2019/ocr/out-pdf
INFO:root:1994 files loaded
INFO:root:Loading goldens from /Users/ahsaasbajaj/Documents/Data/W2/W2_Clean_DataSet_01_20Sep2019/golden/goldens.csv
INFO:root:Total files Goldens: (2000, 45)
INFO:root:Total files found in the source with unique index: (1994, 45)
INFO:root:Processing 1994 IBOCR files to txt


In [316]:
# Sanity check: the number of loaded documents must equal the number of golden
# rows. Written as an assert *statement* (not call-style parentheses) with a
# message, so a failure reports both counts.
assert len(data.dataset.keys()) == data.golden.shape[0], (
    'dataset/golden size mismatch: %d documents vs %d golden rows'
    % (len(data.dataset.keys()), data.golden.shape[0])
)

In [317]:
# Settings passed to candidate generation and to training-data creation.
PROCESSING_CONFIG = dict(X_DIST_THRESHOLD=200, RANDOM_SEQ_LEN=5)

# Address-like golden fields, used as a negative filter when generating
# random sequences.
employer_address = ["Employer's Street Address", "Employer's City-State-Zip"]
employee_address = ["Employee Street Address", "Employee's City-State-Zip"]
other_location = [
    "%s_%d" % (field, slot)
    for slot in (1, 2)
    for field in ("Locality Name", "State")
]

# Arguments for building training data from the curated dataset.
DATA_ARGS = dict(
    task=task,
    dataset=data,
    # Golden field(s) per category; the three address groups reuse the lists
    # defined earlier in this cell.
    candidates_fields=dict(
        org="Employer's Name",
        person="Employee Name",
        address1=employer_address,
        address2=employee_address,
        address3=other_location,
    ),
    # If False, phrases generated by generate_candidates_phrases() are used for
    # the 'None' class in the classifier training data instead.
    use_random_seq=True,
    # 'person-org' or 'person-org-address'; the labels_dict of the task config
    # has to be changed accordingly.
    mode="person-org",
)

# Generate candidate phrases, then compare them with the goldens for the
# configured fields.
data.generate_candidates_phrases(PROCESSING_CONFIG)
data.compare_candidates_and_goldens(DATA_ARGS["candidates_fields"])

INFO:root:Generating candidates for 1994 files
INFO:root:For X_DIST_THRESHOLD configuraion: 200
INFO:root:total files: 1994
person names found in candidates: 1987
org names found in candidates: 1992



In [318]:
# Partition the loaded samples into training and test samples.
data.split_train_test()

INFO:root:Total samples 1994, training samples: 1395, Test Samples: 599


Generate training data from the curated dataset and its goldens.

In [319]:
# Build the train and test data from the curated dataset described by DATA_ARGS.
fe = FeatureEngineeringNER(DATA_ARGS)
train_data, test_data = fe.create_train_test_data(PROCESSING_CONFIG)
# Preview the first rows of the training data.
train_data.head(20)

INFO:root:Generating labeled data for dataset with shape (1395, 45)
INFO:root:Cleaned text size 1395
INFO:root:Random Sequence text size 1395
INFO:root:Mode person-org
INFO:root:Generating labeled data for dataset with shape (599, 45)
INFO:root:Cleaned text size 599
INFO:root:Random Sequence text size 599
INFO:root:Mode person-org
INFO:root:train data: (4185, 2), test test: (1797, 2)


Unnamed: 0,context,label
395,file tax negligence penalty sanction,2
451,"Adams, Mitchell and Jones Group",1
445,Lauren Nolan,0
307,Graves-Jenkins PLC,1
835,Alexander Ellis,0
89,Preston Ltd Group,1
513,Jasmine Cabrera,0
173,code H State,2
961,Gould-Campbell Ltd,1
755,identification number othercompensation Federa...,2


Alternatively, generate the training dataset from public name lists (persons and companies). This path supports only the 'person-org' mode and contains no addresses. The 'none' class uses candidate phrases generated by generate_candidates_phrases().

In [320]:
# Folder holding the public name lists. Set the NAMES_DATA_DIR environment
# variable to run on another machine; the default keeps the original local
# path, so existing setups are unaffected.
NAMES_DIR = os.environ.get(
    'NAMES_DATA_DIR',
    '/Users/ahsaasbajaj/Documents/Data/names'
).rstrip('/')

# Arguments for building training data from name files.
# NOTE: this rebinds DATA_ARGS, replacing the dataset-based arguments defined
# in the earlier cell.
DATA_ARGS = {
    'task': task,
    'filepaths': {
        'org': NAMES_DIR + '/companies_sorted.csv',
        'person1': NAMES_DIR + '/first_names.all.txt',
        'person2': NAMES_DIR + '/last_names.all.txt',
        # generated from datacuration.candidates with person, org, address
        # filtered out as in FeatureEngineeringNER().create_train_test_data
        'none': NAMES_DIR + '/none_phrases200.csv'
    },
    # field name inside the text files or csv, per category
    'name_field': {
        'person': 'name',
        'org': 'name',
        'none': 'context'
    },

    # person-org or person-org-address; labels_dict in the task config needs to
    # be changed accordingly
    'mode': 'person-org'
}

In [321]:
# Build train/test data from the name files listed in DATA_ARGS.
# NOTE: this rebinds train_data and test_data, replacing the dataset-based
# frames created in the earlier cell.
train_data, test_data = FeatureEngineeringNER.create_train_test_data_from_files(DATA_ARGS)

INFO:root:Total 7173426 company names available
INFO:root:Total 164432 first names, 98391 last names available
INFO:root:Total 44738 none phrases available
INFO:root:Selecting minimum of three categories available, min count: 44738
INFO:root:Total samples 134213, training samples: 93995, Test Samples: 40218


In [322]:
# Preview the first 10 rows of the training data.
train_data.head(10)

Unnamed: 0,context,label
96235,this information is being furnished to the int...,2
50822,positively reiki and tarot,1
89224,vogel,1
112330,"16 state wages , tips , etc. 17 state income t...",2
90072,114044 . 61,2
25121,fermín kehrer,0
72291,starr al inc,1
4429,enely stoffey,0
90015,d control number,2
21567,tanairi schoenfeld,0
