In [1]:
import spacy
# Load the "en_core_web_sm" pipeline once. `nlp` is a notebook-level global:
# the CustomRecognizer class defined below reads `nlp.vocab` directly.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Sample input for the entity-recognition experiments below: a two-column
# SELLER / BUYER signature block with names, dates, mailing addresses and
# telephone numbers. The whitespace inside the literal is part of the data.
text = """SELLER   BUYER

JCFRE Supporting Foundation, a Kansas
not-for-profit 501(c)(3) corporation   

ICOP DIGITAL, INC.

By:  Merlys Berenborn, Pres. JCFRE   By:  David C. Owen
Date: 2/7/07   Date: 1/25/07
Mailing Address: 5801 W. 115th St., Overland Park, KS 66211   Mailing Address: 16801 W. 116th, Lenexa, KS 66219
Telephone: 913-327-8134   Telephone: 913-338-5550
 
National Christian Foundation Real Property, Inc. a
Georgia for-profit 501(c)(3) corporation

By:  Paula K. Segars, VP
Date: 2/2/07
Mailing Address:

 

1100 Johnson Ferry Rd., Suite
900, Atlanta, GA 30342

Telephone:  404-591-1770"""

class Utils:
    """Stateless helpers used when building the matcher patterns."""

    @staticmethod
    def int_to_roman(input, lower=False):
        """Convert an integer to its Roman-numeral spelling.

        Args:
            input: Integer in the range 1..3999 (inclusive).
            lower: When true, return the numeral in lower case.

        Returns:
            The Roman numeral as a string, e.g. ``14 -> "XIV"``.

        Raises:
            TypeError: If ``input`` is not an integer (``bool`` is rejected too).
            ValueError: If ``input`` is outside 1..3999.
        """
        # bool is a subclass of int; reject it explicitly so True/False are
        # still treated as "not an integer", as the exact-type check did.
        if isinstance(input, bool) or not isinstance(input, int):
            raise TypeError("expected integer, got %s" % type(input))
        if not 0 < input < 4000:
            raise ValueError("Argument must be between 1 and 3999")

        ints = (1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1)
        nums = ('M', 'CM', 'D', 'CD', 'C', 'XC', 'L', 'XL', 'X', 'IX', 'V', 'IV', 'I')

        pieces = []
        for value, numeral in zip(ints, nums):
            # Pure integer arithmetic: how many times this value fits, and
            # what is left over for the smaller denominations.
            count, input = divmod(input, value)
            pieces.append(numeral * count)

        result = "".join(pieces)
        return result.lower() if lower else result
    
#import spacy;
#from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Span, Doc, Token
from spacy.matcher import Matcher
import string


class Extensions(object):
    """Attribute names derived from a single suffix.

    For a suffix such as ``"phone"`` the instance exposes:
      * ``token`` -> ``"is_phone"``
      * ``span``  -> ``"has_phone"``
      * ``doc``   -> ``"has_phone"`` (same name as ``span``)
      * ``docs``  -> ``"phones"``
    """

    def __init__(self, suffix):
        # Span and doc share one "has_<suffix>" name, so build it once.
        has_name = "has_{}".format(suffix)
        self.token = "is_{}".format(suffix)
        self.span = has_name
        self.doc = has_name
        self.docs = "{}s".format(suffix)
    

class NER(object):
    """Configuration holder describing one custom recognizer.

    A new instance starts with an empty ``clazzName``, an empty ``label`` and
    no ``patterns``; only ``extensions`` is derived from ``name``.
    """

    def __init__(self, name):
        # Attribute names for the custom extensions, derived from `name`.
        self.extensions = Extensions(name)
        # Placeholders: start out empty.
        self.patterns = []
        self.clazzName, self.label = "", ""


class CustomRecognizer(object):
    """Callable component that tags Matcher hits described by an ``NER`` spec.

    Construction registers the patterns of ``ner`` on a ``Matcher`` and
    declares the custom extension attributes named by ``ner.extensions``:

      * ``Token._.<token>`` - boolean flag, default ``False``;
      * ``Doc._.<docs>``    - list, default ``[]``;
      * ``Doc._.<doc>`` / ``Span._.<span>`` - computed by a getter that is true
        when any contained token carries the token flag.

    Calling the instance on a doc flags the matched tokens and appends every
    match to ``doc.ents`` under ``ner.label``.
    """

    def __init__(self, ner):
        """Build the matcher and register the extensions described by ``ner``.

        Args:
            ner: An ``NER`` instance providing ``clazzName``, ``label``,
                ``patterns`` and ``extensions``.
        """
        self.name = ner.clazzName
        self.ner = ner
        # Relies on the notebook-level `nlp` object for the shared vocabulary.
        self.matcher = Matcher(nlp.vocab)
        for pattern in ner.patterns:
            self.matcher.add(ner.label, None, pattern)

        Doc.set_extension(ner.extensions.docs, force=True, default=[])
        Token.set_extension(ner.extensions.token, force=True, default=False)

        # The getter must look at the per-token flag ("is_<suffix>"); the
        # "has_<suffix>" name is only registered on Doc and Span.
        _method = CustomRecognizer.make_method(ner.extensions.token)
        Doc.set_extension(ner.extensions.doc, getter=_method, force=True)
        Span.set_extension(ner.extensions.span, getter=_method, force=True)

        # Also expose the check as a class attribute, as before. Wrapped in
        # staticmethod so that `recognizer.has_<suffix>(tokens)` does not bind
        # the recognizer instance as the first argument.
        setattr(CustomRecognizer, ner.extensions.span, staticmethod(_method))

    def __call__(self, doc):
        """Flag matched tokens and add each match to ``doc.ents``.

        Args:
            doc: The document to process.

        Returns:
            The same ``doc`` object, modified in place.
        """
        for match_id, start, end in self.matcher(doc):
            entity = Span(doc, start, end, label=match_id)
            for token in entity:
                token._.set(self.ner.extensions.token, True)

            # NOTE(review): spaCy is expected to reject overlapping spans when
            # assigning doc.ents - confirm that matches can never overlap each
            # other or entities produced earlier in the pipeline.
            doc.ents = list(doc.ents) + [entity]

        return doc

    @staticmethod
    def make_method(name):
        """Create a getter suitable for a Doc/Span extension attribute.

        Args:
            name: Name of the token-level extension attribute to inspect.

        Returns:
            A one-argument function ``getter(tokens)`` returning ``True`` when
            at least one token in ``tokens`` has a truthy ``name`` extension.
            Extension getters receive only the object itself, so the function
            deliberately takes no ``self``.
        """
        def _method(tokens):
            return any(t._.get(name) for t in tokens)
        return _method

class CardinalRecognizer(object):
    """Callable component that marks list-style enumerators as entities.

    The default patterns cover a CARDINAL entity token followed by a period,
    and a parenthesised marker: a Roman numeral 1-19 (upper or lower case),
    a single ASCII letter, or a single digit.
    """

    name = "CardinalRecognizer"
    label = 'CUSTOM_CARDINAL'

    def __init__(self, nlp, patterns, label):
        """Register ``patterns`` under ``label`` and declare the extensions.

        Args:
            nlp: Language object whose ``vocab`` backs the matcher.
            patterns: Iterable of Matcher token patterns.
            label: Match key under which every pattern is added.
        """
        matcher = Matcher(nlp.vocab)
        for rule in patterns:
            matcher.add(label, None, rule)
        self.matcher = matcher

        getter = self.has_cardinal
        Doc.set_extension('cardinals', force=True, default=[])
        Token.set_extension('is_cardinal', force=True, default=False)
        Doc.set_extension('has_cardinal', getter=getter, force=True)
        Span.set_extension('has_cardinal', getter=getter, force=True)

    def __call__(self, doc):
        """Flag matched tokens as cardinals and append each match to ``doc.ents``."""
        for match_id, start, end in self.matcher(doc):
            entity = Span(doc, start, end, label=match_id)
            for token in entity:
                token._.set('is_cardinal', True)
            doc.ents = list(doc.ents) + [entity]
        return doc

    def has_cardinal(self, tokens):
        """Getter for the Doc and Span ``has_cardinal`` attribute.

        Returns True if at least one token in ``tokens`` has its
        ``is_cardinal`` extension set, otherwise False.
        """
        for token in tokens:
            if token._.get('is_cardinal'):
                return True
        return False

    # Order: "<CARDINAL>." first, then "(X)" for upper-case Roman numerals,
    # lower-case Roman numerals, ASCII letters and finally digits.
    DEFAULT_CARDINAL_PATTERN = [[{'ENT_TYPE': 'CARDINAL'}, {'ORTH': '.'}]] + [
        [{'ORTH': '('}, {'ORTH': marker}, {'ORTH': ')'}]
        for marker in (
            [Utils.int_to_roman(i) for i in range(1, 20)]
            + [Utils.int_to_roman(i, True) for i in range(1, 20)]
            + list(string.ascii_letters)
            + list(string.digits)
        )
    ]
    
    
# Parse the sample text with the pipeline as it stands at this point. The
# custom recognizers are only added to `nlp` in a later cell, where `doc` is
# rebuilt from the same text.
doc = nlp(text)

In [3]:
# --- Cardinal / enumerator recognizer -------------------------------------
cardinal_recog = NER("cardinal")
cardinal_recog.clazzName = "CardinalRecognizer"
cardinal_recog.label = 'CUSTOM_CARDINAL'
# "<CARDINAL>." plus "(X)" where X is an upper-case Roman numeral 1-19, a
# lower-case Roman numeral 1-19, an ASCII letter or a digit (in that order).
cardinal_recog.patterns = [[{'ENT_TYPE': 'CARDINAL'}, {'ORTH': '.'}]] + [
    [{'ORTH': '('}, {'ORTH': marker}, {'ORTH': ')'}]
    for marker in (
        [Utils.int_to_roman(i) for i in range(1, 20)]
        + [Utils.int_to_roman(i, True) for i in range(1, 20)]
        + list(string.ascii_letters)
        + list(string.digits)
    )
]

cardinalComponent = CustomRecognizer(cardinal_recog)

# --- Phone-number recognizer ----------------------------------------------
phone_recog = NER("phone")
phone_recog.clazzName = "PhoneRecognizer"
phone_recog.label = 'PHONE-NUMBER'
phone_recog.patterns = [
    # "(ddd) ddd-dddd", hyphen optional
    [
        {'ORTH': '('}, {'SHAPE': 'ddd'}, {'ORTH': ')'},
        {'SHAPE': 'ddd'}, {'ORTH': '-', 'OP': '?'}, {'SHAPE': 'dddd'},
    ],
    # "ddd-ddd-dddd", both hyphens optional
    [
        {'SHAPE': 'ddd'}, {'ORTH': '-', 'OP': '?'},
        {'SHAPE': 'ddd'}, {'ORTH': '-', 'OP': '?'}, {'SHAPE': 'dddd'},
    ],
]

phoneComponent = CustomRecognizer(phone_recog)

In [4]:
#cardinalComponent = CardinalRecognizer(self.nlp, CardinalRecognizer.DEFAULT_CARDINAL_PATTERN , CardinalRecognizer.label)
print(nlp.pipe_names)

# Append each custom component at the end of the pipeline, unless a component
# with the same name is already registered (keeps the cell re-runnable).
for component in (cardinalComponent, phoneComponent):
    if component.name not in nlp.pipe_names:
        nlp.add_pipe(component, last=True)

print(nlp.pipe_names)

['tagger', 'parser', 'ner']
['tagger', 'parser', 'ner', 'CardinalRecognizer', 'PhoneRecognizer']


In [5]:
# Re-parse the sample text and list every entity as "<label> <text>".
doc = nlp(text)
for entity in doc.ents:
    print(entity.label_, entity.text)

GPE Kansas
GPE 

ORG 501(c)(3
ORG DIGITAL, INC
ORG  Merlys Berenborn, Pres
PERSON David C. Owen
Date
CARDINAL 2/7/07
LOC   Date
EVENT 1/25/07

ORG Mailing
CARDINAL 5801
ORG W. 115th St.
GPE Overland Park
ORG KS
ORG Mailing
DATE 16801
PERSON W. 116th
ORG Lenexa
ORG KS 66219
Telephone
PHONE-NUMBER 913-327-8134
ORG   Telephone
PHONE-NUMBER 913-338-5550
ORG National Christian Foundation Real Property, Inc.
GPE 

GPE Georgia
CARDINAL 501(c)(3
NORP  
PERSON Paula K. Segars
GPE 
Date
CARDINAL 2/2/07
PERSON Johnson Ferry Rd
ORG Suite

GPE Atlanta
ORG GA
PRODUCT 30342


PHONE-NUMBER 404-591-1770


In [6]:
# Display the pattern list assigned to the cardinal recognizer spec.
cardinal_recog.patterns

[[{'ENT_TYPE': 'CARDINAL'}, {'ORTH': '.'}],
 [{'ORTH': '('}, {'ORTH': 'I'}, {'ORTH': ')'}],
 [{'ORTH': '('}, {'ORTH': 'II'}, {'ORTH': ')'}],
 [{'ORTH': '('}, {'ORTH': 'III'}, {'ORTH': ')'}],
 [{'ORTH': '('}, {'ORTH': 'IV'}, {'ORTH': ')'}],
 [{'ORTH': '('}, {'ORTH': 'V'}, {'ORTH': ')'}],
 [{'ORTH': '('}, {'ORTH': 'VI'}, {'ORTH': ')'}],
 [{'ORTH': '('}, {'ORTH': 'VII'}, {'ORTH': ')'}],
 [{'ORTH': '('}, {'ORTH': 'VIII'}, {'ORTH': ')'}],
 [{'ORTH': '('}, {'ORTH': 'IX'}, {'ORTH': ')'}],
 [{'ORTH': '('}, {'ORTH': 'X'}, {'ORTH': ')'}],
 [{'ORTH': '('}, {'ORTH': 'XI'}, {'ORTH': ')'}],
 [{'ORTH': '('}, {'ORTH': 'XII'}, {'ORTH': ')'}],
 [{'ORTH': '('}, {'ORTH': 'XIII'}, {'ORTH': ')'}],
 [{'ORTH': '('}, {'ORTH': 'XIV'}, {'ORTH': ')'}],
 [{'ORTH': '('}, {'ORTH': 'XV'}, {'ORTH': ')'}],
 [{'ORTH': '('}, {'ORTH': 'XVI'}, {'ORTH': ')'}],
 [{'ORTH': '('}, {'ORTH': 'XVII'}, {'ORTH': ')'}],
 [{'ORTH': '('}, {'ORTH': 'XVIII'}, {'ORTH': ')'}],
 [{'ORTH': '('}, {'ORTH': 'XIX'}, {'ORTH': ')'}],
 [{'ORTH': '(

In [None]:
# Display the pattern list assigned to the phone-number recognizer spec.
phone_recog.patterns