Reference: http://mlreference.com/named-entities-spacy

In [1]:
cd ../

/Users/nguyen/projects/archi


In [2]:
import spacy
from spacy import displacy

In [3]:
def skip_and_print(*args):
    """ Act like print(), but skip a line before printing. """
    print('\n' + str(args[0]), *args[1:])

def print_table(rows):
    """ Print `rows` with content-based column widths. """
    col_widths = [
        max(len(str(value)) for value in col)
        for col in zip(*rows)
    ]
    total_width = sum(col_widths) + len(col_widths) - 1
    fmt = ' '.join('%%-%ds' % width for width in col_widths)
    print(fmt % tuple(rows[0]))
    print('~' * total_width)
    for row in rows[1:]:
        print(fmt % tuple(row))

In [4]:
nlp = spacy.load('en')

document_string = "I like to visit Park Tea House in Berkeley."

skip_and_print('Working with string: "%s"' % document_string)
doc = nlp(document_string)


Working with string: "I like to visit Park Tea House in Berkeley."


In [5]:
rows = [['Name', 'Start', 'End', 'Label']]

# Each `ent` object is an instance of the `Span` class.
for ent in doc.ents:
    rows.append([
        ent.text,        # The str of the named entity phrase.
        ent.start_char,  # Source str index of the first char.
        ent.end_char,    # Source str index of the last+1 char.
        ent.label_       # A str label for the entity type.
    ])

In [6]:
skip_and_print('Named entities found:')
print_table(rows)


Named entities found:
Name           Start End Label
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Park Tea House 16    30  ORG  
Berkeley       34    42  GPE  


In [7]:
for ent in doc.ents:
    skip_and_print('Recovering "%s":' % ent)
    print(document_string)
    print(' ' * ent.start_char + '^' * len(ent.text))

    # How you can access the entity subtring using a slice:
    substr = document_string[ent.start_char: ent.end_char]


Recovering "Park Tea House":
I like to visit Park Tea House in Berkeley.
                ^^^^^^^^^^^^^^

Recovering "Berkeley":
I like to visit Park Tea House in Berkeley.
                                  ^^^^^^^^


In [8]:
ibc_string = """Where occupants egress from one or more rooms, areas or spaces through others, 
the design occupant load shall be the combined occupant load of interconnected accessory or intervening spaces. 
Design of egress path capacity shall be based on the cumulative portion of occupant loads of all rooms, areas 
or spaces to that point along the path of egress travel."""

In [9]:
doc = nlp(ibc_string)

In [10]:
rows = [['Name', 'Start', 'End', 'Label']]

# Each `ent` object is an instance of the `Span` class.
for ent in doc.ents:
    rows.append([
        ent.text,        # The str of the named entity phrase.
        ent.start_char,  # Source str index of the first char.
        ent.end_char,    # Source str index of the last+1 char.
        ent.label_       # A str label for the entity type.
    ])

In [11]:
skip_and_print('Named entities found:')
print_table(rows)


Named entities found:
Name Start End Label   
~~~~~~~~~~~~~~~~~~~~~~~
one  28    31  CARDINAL

    79    80  GPE     

    192   193 GPE     

    303   304 GPE     


In [13]:
for ent in doc.ents:
    skip_and_print('Recovering "%s":' % ent)
    print(ibc_string)
    print(' ' * ent.start_char + '^' * len(ent.text))

    # How you can access the entity subtring using a slice:
    substr = ibc_string[ent.start_char: ent.end_char]


Recovering "one":
Where occupants egress from one or more rooms, areas or spaces through others, 
the design occupant load shall be the combined occupant load of interconnected accessory or intervening spaces. 
Design of egress path capacity shall be based on the cumulative portion of occupant loads of all rooms, areas 
or spaces to that point along the path of egress travel.
                            ^^^

Recovering "
":
Where occupants egress from one or more rooms, areas or spaces through others, 
the design occupant load shall be the combined occupant load of interconnected accessory or intervening spaces. 
Design of egress path capacity shall be based on the cumulative portion of occupant loads of all rooms, areas 
or spaces to that point along the path of egress travel.
                                                                               ^

Recovering "
":
Where occupants egress from one or more rooms, areas or spaces through others, 
the design occupant load shall b

In [16]:
doc.ents[0].label_

'CARDINAL'