# Auto-Detection With Custom Named Entity Recognition (NER) 


In [1]:
# pip install -U pip setuptools wheel
# pip install -U spacy
# python -m spacy download en_core_web_sm

In [2]:
# library imports
import json
import os
import re
from datetime import datetime

import pandas as pd
import spacy
from spacy import displacy
from spacy.tokens import DocBin
from tqdm import tqdm

## Annotation

In [3]:
# this dictionary will contain all annotated examples
collective_dict = {'TRAINING_DATA': []}

def structure_training_data(text, kw_list):
    """Annotate keyword occurrences in `text` and store them as one training example.

    Each keyword is searched for with `re.finditer` (ignoring letter case), so it
    is interpreted as a regular-expression pattern. Every non-empty match becomes
    a `(start, end, "SERVICE")` tuple of character offsets into `text`.

    Args:
        text: the raw text to annotate.
        kw_list: iterable of keywords (regex patterns) to look for in `text`.

    Returns:
        None. When at least one entity is found, `[text, {"entities": [...]}]`
        is appended to `collective_dict['TRAINING_DATA']`; otherwise nothing
        is stored.
    """
    entities = []

    # search for instances of keywords within the text (ignoring letter case)
    for kw in tqdm(kw_list):
        # keep only matches that cover at least one character: an empty keyword
        # (or a pattern that can match "") would otherwise add a zero-length
        # entity at every position of the text
        spans = [
            (m.start(), m.end(), "SERVICE")
            for m in re.finditer(kw, text, flags=re.IGNORECASE)
            if m.end() > m.start()
        ]

        if spans:
            entities.extend(spans)
        else:
            # alert when no matches are found given the user inputs
            print("No pattern matches found. Keyword:", kw)

    # add any found entities into a JSON-like structure within collective_dict
    if entities:
        collective_dict['TRAINING_DATA'].append([text, {"entities": entities}])

## Preparing the Training Data

In [4]:
# Sample business descriptions used as training texts. Adjacent string
# literals inside the parentheses are concatenated into a single string.
text1 = (
    "BigTime Care has a broad array of service offerings for Philadelphia-area clientele. "
    "For 50 years, we have specialized in landscaping and lawn mowing. "
    "We also provide seasonal snow removal services for local commercial and residential properties. "
    "Call any time to schedule a consultation!"
)

text2 = (
    "Scrub-O Cleaning connects independent professionals with customers. "
    "We offer the full range of customizable cleaning services that you may need now and in "
    "the future, and our team is ready to begin working for you today! We offer quality maid "
    "services and housekeeping across the San Francisco Bay Area."
)

text3 = (
    "Locally owned and operated, Trust Roofing has the best roofing services in "
    "Philadelphia and the surrounding areas. Whatever the season, you can count on us to provide "
    "you with the best possible roof repair. We will work with any given roof replacement material, "
    "including asphalt shingles and metal roofs. Siding replacement services are also available."
)

text4 = (
    "Based in Pittsburgh PA, Tammy's Branch Cuts is a family owned and managed smalled "
    "businesses founded in 1994. We specialize in full-service landscape design, including "
    "tree removal, lawn care to protect your existing plants, and comprehensive hardscaping for "
    "patios, walkways, and outdoor living spaces. Contact us today!"
)

# TRAINING: annotate each text with the service keywords it contains
structure_training_data(text1, ["landscaping", "lawn mowing", "snow removal"])
structure_training_data(text2, ["cleaning services", "maid services", "housekeeping"])
structure_training_data(text3, ["roofing", "roof repair", "siding replacement"])
structure_training_data(text4, ["landscape design", "tree removal", "lawn care", "hardscaping"])

100%|██████████████████████████████████████████| 3/3 [00:00<00:00, 3000.22it/s]
100%|██████████████████████████████████████████| 3/3 [00:00<00:00, 3000.22it/s]
100%|██████████████████████████████████████████| 3/3 [00:00<00:00, 3000.22it/s]
100%|██████████████████████████████████████████| 4/4 [00:00<00:00, 1999.91it/s]


In [5]:
# show a view of the resulting training data
collective_dict

{'TRAINING_DATA': [['BigTime Care has a broad array of service offerings for Philadelphia-area clientele. For 50 years, we have specialized in landscaping and lawn mowing. We also provide seasonal snow removal services for local commercial and residential properties. Call any time to schedule a consultation!',
   {'entities': [(122, 133, 'SERVICE'),
     (138, 149, 'SERVICE'),
     (176, 188, 'SERVICE')]}],
  ['Scrub-O Cleaning connects independent professionals with customers. We offer the full range of customizable cleaning services that you may need now and in the future, and our team is ready to begin working for you today! We offer quality maid services and housekeeping across the San Francisco Bay Area.',
   {'entities': [(108, 125, 'SERVICE'),
     (238, 251, 'SERVICE'),
     (256, 268, 'SERVICE')]}],
  ['Locally owned and operated, Trust Roofing has the best roofing services in Philadelphia and the surrounding areas. Whatever the season, you can count on us to provide you with 

## Saving the Training Data 

In [6]:
# option to timestamp and save training data to local directory
def save_data(filename, data):
    """Write `data` as indented JSON to a timestamped variant of `filename`.

    The timestamp (month_day_year-hour.minute on a 24-hour clock, followed by
    the literal suffix "MT") is inserted between the file stem and its
    extension, e.g. "Training_Data.json" ->
    "Training_Data--01_31_2024-14.05MT.json".

    Args:
        filename: target path; may contain directories, extra dots in the
            stem, or no extension at all.
        data: any object that `json.dump` can serialise.

    Returns:
        The path of the file that was written.
    """
    # add a timestamp to the filename
    datetime_str = datetime.now().strftime("%m_%d_%Y-%H.%MMT")  # using military time

    # splitext separates only the final extension of the last path component,
    # so other dots in the name or in directory parts are left intact and a
    # name without an extension gets no trailing "."
    stem, extension = os.path.splitext(filename)
    stamped_filename = stem + '--' + datetime_str + extension

    with open(stamped_filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

    return stamped_filename

# save_data('Training_Data', collective_dict)

## Preparing SpaCy Doc Objects

In [7]:
# the annotated examples gathered in collective_dict serve as the training set
TRAIN_DATA = collective_dict["TRAINING_DATA"]

In [8]:
# create a blank model
nlp = spacy.blank('en')

def create_training(TRAIN_DATA):
    """Convert annotated examples into a spaCy DocBin.

    Args:
        TRAIN_DATA: iterable of `(text, annot)` pairs, where
            `annot["entities"]` holds `(start, end, label)` character-offset
            tuples for `text`.

    Returns:
        A DocBin with one Doc per example. Each Doc carries the entity spans
        that could be aligned to tokens and set without conflict; the others
        are dropped.
    """
    db = DocBin()
    for text, annot in tqdm(TRAIN_DATA):
        doc = nlp.make_doc(text)
        ents = []

        # create span objects
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")

            # skip if the character indices do not map to a valid span
            if span is None:
                print("Skipping entity.")
                continue

            # trial-assign the candidate together with the spans accepted so
            # far; spaCy rejects conflicting (e.g. overlapping) entity spans
            # with a ValueError, in which case the candidate is dropped
            try:
                doc.ents = ents + [span]
            except ValueError:
                continue
            ents.append(span)

        # re-apply the accepted spans so the Doc ends up with exactly this
        # list regardless of any rejected attempt above
        doc.ents = ents

        # pack Doc objects into DocBin
        db.add(doc)
    return db

TRAIN_DATA_DOC = create_training(TRAIN_DATA)

100%|███████████████████████████████████████████| 4/4 [00:00<00:00, 222.21it/s]


In [9]:
# Export results (here I add it to a TRAIN_DATA folder within the directory).
# Make sure the folder exists first so that writing the file does not fail
# on a fresh checkout; this is a no-op when the folder is already there.
os.makedirs("./TRAIN_DATA", exist_ok=True)
TRAIN_DATA_DOC.to_disk("./TRAIN_DATA/TRAIN_DATA.spacy")

## Training a Blank NER Model (with CLI)

<b>STEP 1:</b> Copy/paste the full contents of spaCy's default config file <br>
into a file named base_config.cfg within your folder directory. <br>
https://spacy.io/usage/training#config

<b>STEP 2:</b> Define filepaths for the train and dev variables in base_config.cfg <br>

<b>STEP 3:</b> Open Command Prompt, and cd over to the directory of base_config.cfg <br>

<b>STEP 4:</b> Run the following to create the necessary .cfg file: <br>
<i> python -m spacy init fill-config base_config.cfg config.cfg </i> <br>
\- (a config.cfg file will appear in the current directory)

<b>STEP 5:</b> Next run the following to begin training: <br>
<i> python -m spacy train config.cfg --output ./output </i> <br>
\- (the model results will appear in a new folder called output) 

## Model Demonstrations

In [10]:
# load the trained model
nlp_output = spacy.load("output/model-best")

def model_visualization(text):
    """Highlight and list the SERVICE entities that the trained model finds.

    Args:
        text: raw text to run through the `nlp_output` pipeline.

    Returns:
        None. The entities are rendered with displaCy and their texts are
        printed one per line under an "IDENTIFIED ENTITIES:" header.
    """
    # pass our test instance into the trained pipeline
    doc = nlp_output(text)

    # customize the label colors
    colors = {"SERVICE": "linear-gradient(90deg, #E1D436, #F59710)"}
    options = {"ents": ["SERVICE"], "colors": colors}

    # visualize the identified entities
    displacy.render(doc, style="ent", options=options)

    # print out the identified entities (a plain loop, since printing is a
    # side effect and no list of results is needed)
    print("IDENTIFIED ENTITIES:")
    for ent in doc.ents:
        if ent.label_ == "SERVICE":
            print(ent.text)

### Below are 3 test cases demonstrating the model's capabilities.
 
 <font size="3">Notice that the model was trained on merely four training instances, yet still <br> generalizes to detect various services that were <b>not</b> part of the original training. 
    <br><br> 
    Of course, much more training is needed to increase model accuracy. 
 </font>


In [11]:
# unseen example: tree-care and landscaping services
test1 = (
    "At Perfection Landscapes LLC, we are committed to protecting the health of trees "
    "and shrubs in urban and suburban areas. We work with clients to provide expertise in all areas "
    "of tree care, stump removal, and construction-related tree preservation. Our trained experts "
    "also have years of experience with insect control. Call us today for a consultation!"
)

model_visualization(test1)

IDENTIFIED ENTITIES:
tree care
stump removal
tree preservation


In [12]:
# unseen example: commercial cleaning services
test2 = (
    "J.K. Commercial Cleaning is dedicated to creating clean, safe, and healthy "
    "environments. We offer cleaning programs that are tailored to fit your business's individual needs. "
    "This includes janitorial services, and consistent quality deep cleaning services for both commercial and residential spaces."
)

model_visualization(test2)

IDENTIFIED ENTITIES:
cleaning programs
janitorial
cleaning services


In [13]:
# unseen example: plumbing services
test3 = (
    "Small leaks in your bathroom or kitchen can ruin your day. For extensive residential plumbing services "
    "at affordable prices, contact J.K. Plumbing today. We do all work according to local, state, and "
    "city codes and are backed by over a decade of experience."
)

model_visualization(test3)

IDENTIFIED ENTITIES:
plumbing
