In [3]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[K     |████████████████████████████████| 13.6 MB 1.4 MB/s eta 0:00:01    |████████▏                       | 3.5 MB 202 kB/s eta 0:00:50     |████████████████████████████▊   | 12.2 MB 2.0 MB/s eta 0:00:01


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [1]:
import spacy

In [6]:
nlp = spacy.load('en_core_web_sm')

In [11]:
text = "This likely means you're already running a local web server, so there's no need to make displaCy start another one. Instead, you should be able to replace displacy.serve with displacy.render to show the visualization."

In [12]:
doc = nlp(text)

In [13]:
list(doc.sents)

[This likely means you're already running a local web server, so there's no need to make displaCy start another one.,
 Instead, you should be able to replace displacy.serve with displacy.render to show the visualization.]

In [14]:
sentence_spans = list(doc.sents)
spacy.displacy.serve(sentence_spans, style="dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [15]:
token_example = doc[0]
print(token_example.ent_type_)
print(token_example.pos_)
print(token_example.tag_)
print(token_example.vector)


DET
DT
[-0.13571987 -0.25953645  0.5980234   1.0543      0.9973282   0.20679954
 -1.1775682   0.9776583   0.827132   -0.40455437  0.40885547 -0.01581306
 -0.4600075   0.18371391 -0.6644426  -0.6934086   1.1224751  -1.0670991
  1.2296829   0.6481814  -0.2163907  -0.4381313  -0.8297271  -0.5973476
  0.5033427   0.35190928 -0.02126464 -0.9231523   0.24844706 -0.42874908
 -0.41593593 -0.303098   -0.09194261 -0.23272203 -0.35986033  0.59478164
 -0.14595515  0.75109375  2.1420653   0.8069137   0.6573932   0.01789705
  0.3098136  -0.642105   -0.11232674 -0.5436055  -0.5710929   0.22215533
 -0.6134101   0.66907096  0.54487944 -0.40347102  0.04423305  1.2583436
  0.72793984 -1.1403402  -0.26322478 -0.27574587 -0.14583966  0.03284772
 -0.97116137 -0.25193024 -0.47947413  1.7396077  -0.4497463  -0.5423039
 -0.3933073  -0.03255526  1.5368824   0.0155862   0.5394334   0.3945644
 -0.793931   -0.757059   -0.5574567   0.3093553   0.02905138 -0.62371695
 -1.1789527  -1.4453558   0.64246684 -0.27791414

In [16]:
dir(doc[0])

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [17]:
text = 'I work at J&L Consulting Ltd.'
doc = nlp(text)

In [21]:
for token in doc:
    print(token.text)
    print(token.pos_)

I
PRON
work
VERB
at
ADP
J&L
PROPN
Consulting
PROPN
Ltd.
PROPN


In [19]:
from spacy.matcher import Matcher

In [20]:
rule = [
    {'POS':'PROPN', 'OP':'+'},
    {'TEXT': 'Ltd.'}
]


matcher = Matcher(nlp.vocab)
matcher.add("ORG", [rule])
matches = matcher(doc)
print(matches)
for _, start, end in matches:
    print(doc[start:end])

[(383, 4, 6), (383, 3, 6)]
Consulting Ltd.
J&L Consulting Ltd.


In [23]:
from ipymarkup import show_box_markup

In [24]:
def i_to_idx(matches, doc):
    spans = set()
    for match in matches:
        start_i = match[1]
        end_i = match[2]
        span = doc[start_i:end_i]
        spans.add((span[0].idx, span[0].idx + len(span.text)))
    return spans

In [25]:
show_box_markup(doc.text, i_to_idx(matches, doc))