In [None]:
!pip install spacy==2.2.4

In [2]:
import pandas as pd, spacy, random
from spacy.util import minibatch, compounding
from IPython.display import display, Image
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import json
from itertools import groupby
from google.colab import drive
import os
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/ArmorDoc_Challenge/')

Mounted at /content/drive


# Load the training data

## Training data format:
A list of tuples, where each tuple contains 1 data point for a text as shown below.

The numbers means starting and ending position of the entities in hte text or string. For example 'STREET' starts at position 210 and ends at position 229 of the string.

In [None]:
SAMPLE_TRAIN_DATA = [('BE SUBORDINATED UPON THE REFINANCING OF ANY PRIOR MORTGAGE\nTHIS DEED OF TRUST...',
 {'entities': [(210, 229, 'STREET'), (231, 239, 'CITY'), (241, 243, 'STATE'), (244, 249, 'ZIP')]})]

## Loading Wrangled Data

In [5]:
page_df = pd.read_csv('./data/extracted_data.csv',index_col=0)

In [6]:
page_df.head()

Unnamed: 0,page_number,image_file_names,image_file_paths,text
0,0,page0.png,/data/png_images/page0.png,\n\n \n\nADJUSTABLE RATE NOTE\n(HOME EQUITY C...
1,1,page1.png,/data/png_images/page1.png,ADJUSTABLE RATE NOTE\n\n(HOME EQUITY CONVERSIO...
2,2,page2.png,/data/png_images/page2.png,ADJUSTABLE RATE SECOND NOTE\n\n(HOME EQUITY CO...
3,3,page3.png,/data/png_images/page3.png,\n\napSOsTaBLe RATE SECONDSOTE\n(HOME EQUITY ...
4,4,page4.png,/data/png_images/page4.png,\n\n \n\nADJUSTABLE RATE SECOND NOTE\n(HOME E...


In [7]:
page_df.text[0]

' \n\n \n\nADJUSTABLE RATE NOTE\n(HOME EQUITY CONVERSION)\nSTATE OF VA\n\nAugust 29, 2008\n\nPROPERTY ADDRESS FHA Case Number: PF\nLoan\n\nMIN Number:\nNorfolk, VA 23507\nNorfolk (City) COUNTY\n\n1. DEFINITIONS :\n"Borrower" means each person signing at the end of this Note. "Lender" means EverBank Reverse Mortgage LLC and its\nsuccessors and assigns. "Secretary" means the Secretary of Housing and Urban Development or his or her authorized\nrepresentatives.\n\n2. BORROWER\'S PROMISE TO PAY; INTEREST\n\nIn return for amounts to be advanced by Lender up to a maximum principal amount of Four Hundred Seventy Thousand Two\nHundred Fifty and 00/100 Dollars ($470,250.00), to or for the benefit of Borrower under the terms of a Home Equity\nConversion Loan Agreement dated August 29, 2008 ("Loan Agreement"), Borrower promises to pay to the order of Lender a\nprincipal amount equal to the sum of all Loan Advances made under the Loan Agreement with interest. All amounts advanced by\nLender, plus i

## Preparing Data to Be Exported to Label Studio

To increase accuracy we will extract only the paragraph containing the Lenders name to reduce unnecesary extranneous information for the NER model to have to process.

In [None]:
abbreviated_pages = []

for page in all_pages:
  temp_page = ''
  temp_page = page[page.index("Lend"):]
  temp_page = temp_page[:temp_page.index('\n\n')]
  abbreviated_pages.append(temp_page)

abbreviated_pages 

In [8]:
# Download spaCy models:
models = {'en_core_web_sm': spacy.load("en_core_web_sm")}

# This function converts spaCy docs to the list of named entity spans in Label Studio compatible JSON format:
def doc_to_spans(doc):
    tokens = [(tok.text, tok.idx, tok.ent_type_) for tok in doc]
    results = []
    entities = set()
    for entity, group in groupby(tokens, key=lambda t: t[-1]):
        if not entity:
            continue
        group = list(group)
        _, start, _ = group[0]
        word, last, _ = group[-1]
        text = ' '.join(item[0] for item in group)
        end = last + len(word)
        results.append({
            'from_name': 'label',
            'to_name': 'text',
            'type': 'labels',
            'value': {
                'start': start,
                'end': end,
                'text': text,
                'labels': [entity]
            }
        })
        entities.add(entity)

    return results, entities

In [118]:
data_to_export_to_label_studio = abbreviated_pages[:10]

entities = set()
tasks = []
for text in data_to_export_to_label_studio:
    predictions = []
    for model_name, nlp in models.items():
        doc = nlp(text)
        spans, ents = doc_to_spans(doc)
        entities |= ents
        predictions.append({'model_version': model_name, 'result': spans})
    tasks.append({
        'data': {'text': text},
        'predictions': predictions
    })

# Save Label Studio tasks.json
print(f'Save {len(tasks)} tasks to "label_studio_tasks.json"')
with open('./data/label_studio_tasks.json', mode='w') as f:
    json.dump(tasks, f, indent=2)
    
# Save class labels as a txt file
print('Named entities are saved to "label_studio_named_entities.txt"')
with open('./data/label_studio_named_entities.txt', mode='w') as f:
    f.write('\n'.join(sorted(entities)))

Save 10 tasks to "label_studio_tasks.json"
Named entities are saved to "label_studio_named_entities.txt"


## Importing Data From Label Studio
I utilized Label Studio to edit the NER labels to use as a gold standard to retrain the model.

#### Formating Data For NER Model
Following format as detailed in the sample train data:

In [125]:
gold_standard = pd.read_json('./data/gold_standard_data_3.json')

In [126]:
TRAIN_DATA = []

for idx in range(len(gold_standard)):
  dict_ = {}
  entity_list = []
  for i in gold_standard.iloc[idx]['annotations'][0]['result']:
    if 'labels' in i['value'].keys():
      # entity_list.append((i['value']['start'],i['value']['end'],i['value']['labels'][0]))
      print(i)
    print('new_entry')

{'value': {'start': 14, 'end': 43, 'text': 'EverBank Reverse Mortgage LLC', 'labels': ['ORG']}, 'id': 'q09ZaqTM8G', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}
new_entry
{'value': {'start': 98, 'end': 140, 'text': 'Secretary of Housing and Urban Development', 'labels': ['ORG']}, 'id': 'lTqbKDbdI0', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}
new_entry
{'value': {'start': 14, 'end': 43, 'text': 'EVERBANK REVERSE MORTGAGE LLC', 'labels': ['ORG']}, 'id': 'BBiI8BNuuu', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}
new_entry
{'value': {'start': 14, 'end': 43, 'text': 'EverBank Reverse Mortgage LLC', 'labels': ['ORG']}, 'id': 'D2iIknM22B', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}
new_entry
{'value': {'start': 98, 'end': 140, 'text': 'Secretary of Housing and Urban Development', 'labels': ['ORG']}, 'id': 'tA-j-XQJCt', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}
new_entry
{'value': {'start': 14, 'end': 43, 'text': 'Eve

In [127]:
TRAIN_DATA = []

for doc in range(len(gold_standard)):
  dict_ = {}
  entity_list = []
  for entity in gold_standard.iloc[doc]['annotations'][0]['result']:
    if 'labels' in entity['value'].keys():
      entity_list.append((entity['value']['start'],entity['value']['end'],entity['value']['labels'][0]))
  dict_['entities'] = entity_list
  TRAIN_DATA.append((gold_standard.iloc[idx]['data']['text'], dict_))

In [136]:
(TRAIN_DATA)

[('Lender" means Mortgage.Shop, LLC and its successors and\nassigns. “Secretary” means the Secretary of Housing and Urban Development or his or her authorized representatives.',
  {'entities': [(14, 43, 'ORG'), (98, 140, 'ORG')]}),
 ('Lender" means Mortgage.Shop, LLC and its successors and\nassigns. “Secretary” means the Secretary of Housing and Urban Development or his or her authorized representatives.',
  {'entities': [(18, 60, 'ORG')]}),
 ('Lender" means Mortgage.Shop, LLC and its successors and\nassigns. “Secretary” means the Secretary of Housing and Urban Development or his or her authorized representatives.',
  {'entities': [(18, 60, 'ORG')]}),
 ('Lender" means Mortgage.Shop, LLC and its successors and\nassigns. “Secretary” means the Secretary of Housing and Urban Development or his or her authorized representatives.',
  {'entities': [(31, 60, 'ORG'), (18, 60, 'ORG')]}),
 ('Lender" means Mortgage.Shop, LLC and its successors and\nassigns. “Secretary” means the Secretary of Housi

In [138]:
# this was necessary as there was an inexplicable overlapping entry. 
del TRAIN_DATA[3][1]['entities'][-1]

In [139]:
TRAIN_DATA[3][1]['entities']

[(31, 60, 'ORG')]

## Creating New NER Model
---



In [140]:
nlp = spacy.blank("en")
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)

In [141]:
ent_list = []

for page in TRAIN_DATA:
  ents = page[1]['entities']
  for i in ents:
    ent = i[2]
    if ent not in ent_list:
      ent_list.append(ent)

print(ent_list)    

for lb in ent_list: # Change the label.
    ner.add_label(lb)

print(ner.labels)    

## Training the NER model 
(code included in challenge prompt)

In [158]:
optimizer = nlp.begin_training()

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

with nlp.disable_pipes(*other_pipes):  # only train NER
    sizes = compounding(1.0, 4.0, 1.001)
    # batch up the examples using spaCy's minibatch
    for itn in range(50):
        random.shuffle(TRAIN_DATA)
        batches = minibatch(TRAIN_DATA, size = sizes)
        losses = {}
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd = optimizer, drop = 0.35, losses = losses)
        print("Losses", losses)

Losses {'ner': 3.999446321405684}
Losses {'ner': 3.997885442548955}
Losses {'ner': 4.079382065115307}
Losses {'ner': 6.197864055589422}
Losses {'ner': 4.004673180881026}
Losses {'ner': 3.99909774867133}
Losses {'ner': 4.6169192830709544}
Losses {'ner': 3.657878009800444}
Losses {'ner': 3.9993480120871503}
Losses {'ner': 5.742971528824516}
Losses {'ner': 3.360816757950336}
Losses {'ner': 3.996324704527136}
Losses {'ner': 4.006573022724163}
Losses {'ner': 4.1771108272026325}
Losses {'ner': 4.0159895446119585}
Losses {'ner': 4.000199717978128}
Losses {'ner': 3.9765480091066268}
Losses {'ner': 5.015894234498647}
Losses {'ner': 3.9999311432973452}
Losses {'ner': 3.999867442931904}
Losses {'ner': 4.773667319145383}
Losses {'ner': 4.022534088648857}
Losses {'ner': 3.9984856032542773}
Losses {'ner': 3.999357412668066}
Losses {'ner': 7.799980811008604}
Losses {'ner': 6.283225081518098}
Losses {'ner': 3.9980052031481916}
Losses {'ner': 4.0024301579855}
Losses {'ner': 3.9972654337158}
Losses {'ne

# Saving the Model

In [146]:
folder_path = './data/trained_ner_model.pth'
nlp.to_disk(folder_path)

In [192]:
folder_path = './data/trained_ner_model.pth' #Change this
nlp = spacy.load(folder_path)

## Extracting Lender Names with Trained NER Model
##### Note: My Trained NER Model is Not Indentifying Any Named Entities
##### Note: There appears there is an bug in my code. I have not used an NER Model in this type of application in the past. I suspect I might not be using a larger enough Gold Standard Training Set, or I am not including enough types of "named entities" for the model to recognize. 

In [193]:
entity_list = []

for page in abbreviated_pages:
  page_entity_list = []
  for e in nlp(page).ents:
    page_entity_list.append(e.text)
  # appending only the first organization in the list as the first one is always the lender
  # and if there is a organization, it is always the secretary.
  if page_entity_list:
    entity_list.append(page_entity_list[0])
  else:
    entity_list.append(None)

In [197]:
print(entity_list)

[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]


## Extracting Lender "Named Entities" Using Out-of-the-Box Spacy Model

In [202]:
nlp = spacy.load("en_core_web_sm")

entity_list = []

for page in abbreviated_pages:
  page_entity_list = []
  for e in nlp(page).ents:
    page_entity_list.append(e.text)
  # appending only the first organization in the list as the first one is always the lender
  # and if there is a organization, it is always the secretary.
  if page_entity_list:
    entity_list.append(page_entity_list[0])
  else:
    entity_list.append(None)



## Exporting Lender Name Data As CSV File
Note: For the final CSV file I will submit for this exercise, I will use the Lender names captured with Regular Expressions, found in notebook #4.

In [209]:
extracted_lender_names_with_ner_df = page_df.copy()
extracted_lender_names_with_ner_df['lender'] = entity_list
extracted_lender_names_with_ner_df = extracted_lender_names_with_ner_df[['page_number', 'lender']]

# exporting csv
extracted_lender_names_with_ner_df.to_csv('./data/lender_names_extracted_with_NER_Model.csv')
extracted_lender_names_with_ner_df.head()

Unnamed: 0,page_number,lender
0,0,EverBank
1,1,LLC
2,2,Housing and Urban Development
3,3,Housing
4,4,Housing and Urban\nDevelopment
