In [1]:
import spacy
import thinc
import numpy
from spacy.pipeline import Tagger

In [2]:
# Record the installed spaCy version; the outputs in this notebook come from it.
spacy.__version__

'2.0.9'

In [3]:
# Load the pipeline installed under the name 'en'.
nlp = spacy.load('en')

### First Example / short

In [4]:
# Run the pipeline on one sentence and show the sentence spans it produced.
doc = nlp(u'This is a good sentence.')
print(list(doc.sents))
# create_pipe only builds the component and returns it (that repr is this
# cell's output); nothing is added to nlp's pipeline here.
nlp.create_pipe('sentencizer')

[This is a good sentence.]


<spacy.pipeline.SentenceSegmenter at 0x7f09f86bc5c0>

In [4]:
# NOTE(review): this Tagger is constructed from the shared vocab but is neither
# used nor added to the pipeline in any cell shown here — confirm it is needed.
tagger = Tagger(nlp.vocab)
doc = nlp(u"This is a sentence.")

In [5]:
# Echo the Doc; it displays as its text.
doc

This is a sentence.

### Second Example / long

In [17]:
# Rebind nlp to the 'en_vectors_web_lg' package. Cells below use this object,
# not the 'en' pipeline loaded earlier, until nlp is reassigned again.
nlp = spacy.load('en_vectors_web_lg')

In [20]:
# Print the docstring of nlp.create_pipe for reference.
print(nlp.create_pipe.__doc__)

Create a pipeline component from a factory.

        name (unicode): Factory name to look up in `Language.factories`.
        config (dict): Configuration parameters to initialise component.
        RETURNS (callable): Pipeline component.
        


In [19]:
# Print the docstring of nlp.add_pipe for reference.
print(nlp.add_pipe.__doc__)

Add a component to the processing pipeline. Valid components are
        callables that take a `Doc` object, modify it and return it. Only one
        of before/after/first/last can be set. Default behaviour is "last".

        component (callable): The pipeline component.
        name (unicode): Name of pipeline component. Overwrites existing
            component.name attribute if available. If no name is set and
            the component exposes no name attribute, component.__name__ is
            used. An error is raised if a name already exists in the pipeline.
        before (unicode): Component name to insert component directly before.
        after (unicode): Component name to insert component directly after.
        first (bool): Insert component first / not first in the pipeline.
        last (bool): Insert component last / not last in the pipeline.

        EXAMPLE:
            >>> nlp.add_pipe(component, before='ner')
            >>> nlp.add_pipe(component, name='custom_nam

In [18]:
# Build the sentencizer and append it to the pipeline.
# add_pipe raises if a component with the same name is already registered, so
# check first: this keeps the cell safe to re-run on a live kernel.
sentencizer = nlp.create_pipe('sentencizer')
if getattr(sentencizer, 'name', None) not in nlp.pipe_names:
    nlp.add_pipe(sentencizer)

In [21]:
# Print the docstring of the vocab object for reference.
print(nlp.vocab.__doc__)

A look-up table that allows you to access `Lexeme` objects. The `Vocab`
    instance also provides access to the `StringStore`, and owns underlying
    C-data that is shared between `Doc` objects.
    


In [30]:
# Shape of the vector table: 1070971 rows by 300 columns in this run.
nlp.vocab.vectors.data.shape

(1070971, 300)

In [31]:
# Keep a notebook-level name for the vector table whose shape was inspected above.
embeddings = nlp.vocab.vectors.data

In [35]:
# Load the IMDB data through thinc's dataset helpers.
# NOTE(review): only `import thinc` appears at the top of the notebook, so this
# attribute chain assumes thinc.extra.datasets is already loaded — confirm on a
# fresh kernel.
imdb_data = thinc.extra.datasets.imdb()

In [39]:
# Inspect the container type returned by imdb() (a tuple in this run).
type(imdb_data)

tuple

In [41]:
# Peek at one record of the first split: a (review text, integer label) pair.
imdb_data[0][1]

("Sure, we all like bad movies at one time or another, and we in fact enjoy them, This however, wasn't even a guilty pleasure, it was just crap. Some guy, vince offer, who is conceited enough to make himself the main character while probably got drunk/high--probably both--and thought it was a great idea to make a movie. He then proceeded to show his script to equally high/drunk individuals. Overall, this movie was so bad, predictable, and unoriginal I couldn't get through 20 minutes of it before I turned it off. It makes You Got Served look like Citizen Kane. Bat Man? WTF...Some guy that walks around with a bat, real original. Almost as good as calling him Fat Man, and having a fat guy walk around in a superhero outfit.",
 0)

In [49]:
# One value per line: number of splits, size of each split, then the label of
# the record inspected above.
print(
    len(imdb_data),
    len(imdb_data[0]),
    len(imdb_data[1]),
    imdb_data[0][1][1],
    sep='\n',
)

2
25000
25000
0


In [51]:
# Unzip the records of the second split into two parallel tuples.
# NOTE(review): assumes the second split holds (text, label) pairs like the
# first and is the dev set, as the names suggest — confirm against imdb().
dev_texts, dev_labels = zip(*imdb_data[1])

In [54]:
# Both tuples should have the same length; print one count per line.
print(len(dev_texts), len(dev_labels), sep='\n')

25000
25000


In [56]:
def get_features(docs, max_length):
    """Encode documents as fixed-width rows of vector-table ids.

    docs (iterable): Documents to encode. Each one is iterated token by token,
        and every token must expose `orth` and `vocab.vectors.find`.
    max_length (int): Number of columns per row. Longer documents are
        truncated; shorter ones are right-padded with 0.
    RETURNS (numpy.ndarray): int32 array of shape (len(docs), max_length).
        A token whose lookup returns a negative id is stored as 0, so 0 is
        shared by padding, tokens without a vector, and a real hit on row 0.
    """
    docs = list(docs)
    Xs = numpy.zeros((len(docs), max_length), dtype='int32')
    for i, doc in enumerate(docs):
        # Pairing tokens with range(max_length) stops after max_length tokens,
        # so max_length == 0 leaves an empty row instead of indexing past the
        # end of the array.
        for j, token in zip(range(max_length), doc):
            vector_id = token.vocab.vectors.find(key=token.orth)
            # The array starts zeroed, so only real hits need to be written.
            if vector_id >= 0:
                Xs[i, j] = vector_id
    return Xs

In [60]:
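# NOTE: get_features reads token.vocab / token.orth, so it needs parsed Doc
# objects; dev_texts holds raw strings and must be run through nlp first.
#xs = get_features(dev_texts,5)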

In [62]:
import requests
from spacy.tokens import Token, Span
from spacy.matcher import PhraseMatcher

class Countries(object):
    """Pipeline component that marks country names as entities.

    Country records are downloaded once, when the component is constructed,
    and matched against each `Doc` with a `PhraseMatcher`. Every token inside
    a match gets the `is_country`, `country_capital` and `country_latlng`
    extension attributes; the match is appended to `doc.ents` and merged into
    a single token.
    """
    name = 'countries'  # component name shown in pipeline

    def __init__(self, nlp, label='GPE',
                 api_url='https://restcountries.eu/rest/v2/all', timeout=10):
        """Download the country records and build the matcher.

        nlp (Language): Pipeline whose vocab and tokenizer are shared.
        label (unicode): Entity label assigned to every matched country.
        api_url (unicode): Endpoint returning a JSON list of country records,
            each with at least 'name', 'capital' and 'latlng' keys.
        timeout (float): Seconds to wait for the HTTP request.
        """
        # NOTE(review): confirm the default endpoint is still being served;
        # pass api_url to point at a replacement.
        # Without a timeout a stalled connection would block the kernel forever.
        r = requests.get(api_url, timeout=timeout)
        # Surface HTTP errors here rather than as a confusing JSON/KeyError below.
        r.raise_for_status()
        self.countries = {c['name']: c for c in r.json()}  # name -> record
        # The matcher only needs tokens, so tokenize the names instead of
        # running the whole pipeline on every one of them.
        self.matcher = PhraseMatcher(nlp.vocab)
        patterns = [nlp.make_doc(c) for c in self.countries]
        self.matcher.add('COUNTRIES', None, *patterns)
        self.label = nlp.vocab.strings[label]  # get label ID from vocab
        # Register the Token extensions once. The explicit default and the
        # has_extension guard keep this safe when the component is built again
        # in the same process.
        for ext_name, ext_default in (('is_country', False),
                                      ('country_capital', None),
                                      ('country_latlng', None)):
            if not Token.has_extension(ext_name):
                Token.set_extension(ext_name, default=ext_default)

    @staticmethod
    def _longest_matches(matches):
        """Reduce (id, start, end) matches to sorted, non-overlapping (start, end) pairs, preferring longer matches."""
        chosen = []
        used = set()
        # Longest first; ties go to the match that starts earlier.
        for _, start, end in sorted(matches, key=lambda m: (m[1] - m[2], m[1])):
            if used.isdisjoint(range(start, end)):
                used.update(range(start, end))
                chosen.append((start, end))
        chosen.sort()
        return chosen

    def __call__(self, doc):
        """Annotate country mentions in `doc` and return it.

        doc (Doc): Document to process; modified in place.
        RETURNS (Doc): The same document, with country spans added to
            `doc.ents` and merged into single tokens.
        """
        spans = []  # keep the spans for later so we can merge them afterwards
        # A short name can sit inside a longer one ("Sudan" in "South Sudan");
        # keep only the longest match so entities never overlap.
        for start, end in self._longest_matches(self.matcher(doc)):
            entity = Span(doc, start, end, label=self.label)
            country = self.countries[entity.text]  # one lookup per match
            for token in entity:  # set values of token attributes
                token._.set('is_country', True)
                token._.set('country_capital', country['capital'])
                token._.set('country_latlng', country['latlng'])
            spans.append(entity)
        doc.ents = list(doc.ents) + spans  # extend the existing entities, don't replace them
        for span in spans:
            span.merge()  # merge all spans at the end to avoid mismatched indices
        return doc  # the pipeline expects the Doc back

In [63]:
# Reload the 'en' pipeline (nlp was last bound to the vectors-only package),
# build the component, and insert it ahead of the tagger.
nlp = spacy.load('en')
component = Countries(nlp)
nlp.add_pipe(component, before='tagger')
doc = nlp(u"Some text about Colombia and the Czech Republic")

# Entities on the processed Doc; expected output on the next line.
print([(ent.text, ent.label_) for ent in doc.ents])
# [('Colombia', 'GPE'), ('Czech Republic', 'GPE')]

# Capital stored on every token flagged as a country; expected output below.
print([(token.text, token._.country_capital) for token in doc if token._.is_country])
# [('Colombia', 'Bogotá'), ('Czech Republic', 'Prague')]

[('Colombia', 'GPE'), ('Czech Republic', 'GPE')]
[('Colombia', 'Bogotá'), ('Czech Republic', 'Prague')]


In [70]:
# Confirm the component is an instance of the Countries class defined in this notebook.
type(component)

__main__.Countries

In [65]:
# Print the text of the processed Doc.
print(doc)

Some text about Colombia and the Czech Republic


In [69]:
# Shape of the document-level vector: (384,) in this run.
doc.vector.shape

(384,)