In [2]:
import spacy

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_sm")

# Process whole documents
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = nlp(text)

# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Verbs: ['start', 'work', 'drive', 'take', 'can', 'tell', 'would', 'shake', 'turn', 'talk', 'say']
Sebastian NORP
Google ORG
2007 DATE
American NORP
Recode ORG
earlier this week DATE


In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(u"displaCy uses JavaScript, SVG and CSS.")
spacy.displacy.serve(doc, style='dep')

  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [4]:

from spacy.lang.en import English
nlp = English()
doc = nlp("hellow world")
for t in doc :
    print(t.text)
    print(t.vector)

hellow
[]
world
[]


In [None]:
from spacy.lang.es import Spanish

In [6]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("hellow world")
for t in doc :
    print(t.text)
    print(t.vector)

hellow
[-1.1371379  -1.9749116  -0.53645325 -3.2109115   1.8152902   4.1263747
  0.21082741  1.6221669   3.2469401   1.4039191   6.124388    1.9107443
  3.2682767  -1.3835092  -0.4910313   1.5196924  -1.3797178   2.7658281
 -2.616232   -3.0287812  -0.58697915 -1.6838125  -0.22886401  0.20042604
 -0.9066862  -2.0774417   0.01931107 -5.038466    1.1282427   0.07202721
  0.07329738 -0.02612376 -3.1624115  -1.5150045   0.8274615  -0.03308117
 -0.6634151   1.0509708  -2.553903    2.1382122   0.26934114  0.1648887
 -1.2732704  -0.9269384  -0.05518252  1.5297736  -2.6703012  -0.11279678
  0.39836437 -0.03538334 -0.07234037  0.8112725   0.1703046  -1.5282271
 -3.676477   -1.0999498   1.2729576   2.1082683   1.5595723  -1.9066247
  1.5081387  -2.586961   -1.7680396   2.2938964   1.9137545  -0.7093155
  3.2383482  -0.8457866  -0.20244236  3.1068826  -1.6256378  -0.5770912
 -0.9606688   0.9470196  -0.8506236   1.4809227   0.4629857   0.43661296
 -0.4679581   4.198527    1.1753109  -4.820634   -1.

In [7]:
doc = nlp('computer')
doc

computer

In [8]:
doc.vector.shape

(96,)

In [22]:
import spacy
nlp = spacy.load("en_core_web_lg")
doc = nlp('computer')
doc

computer

In [4]:
doc.vector.shape

(300,)

In [10]:
from scipy.spatial import distance

In [11]:
distance.cosine(doc.vector, doc.vector)

0.0

In [12]:
doc1 = nlp('France')
doc2 = nlp('Paris')
doc3 = nlp('England')
doc4 = nlp('London')

In [13]:
distance.cosine(doc1.vector, doc2.vector)

0.2396896481513977

In [14]:
distance.cosine(doc3.vector, doc4.vector)

0.26769328117370605

In [16]:
doc2.vector-doc1.vector

array([ 0.13717926,  2.8642626 , -0.46900517,  1.1681223 ,  0.2790904 ,
        0.5852642 , -0.8317218 , -2.6381965 ,  1.420256  ,  1.2092538 ,
        0.34929967,  0.780982  , -2.481535  , -0.41050363,  0.5083558 ,
       -1.5651579 ,  3.0892463 , -0.1755979 ,  1.495831  ,  1.9119772 ,
       -0.6928706 , -2.5545318 , -2.5622053 , -0.41803455,  1.865575  ,
        0.35601568, -1.0559455 ,  0.0677979 , -1.7226499 ,  1.0913198 ,
        0.6478702 ,  1.6644791 ,  1.0548083 ,  1.3530537 , -0.7770574 ,
        2.0276308 , -1.1549687 ,  0.84120977,  1.8364507 , -1.5380344 ,
       -0.3916645 , -0.6129689 ,  0.15785368,  0.9284959 ,  0.46233445,
       -0.57560825, -0.7006646 ,  1.4479146 , -0.56456465, -1.9107955 ,
        1.2332287 ,  0.25938118, -0.42328215,  2.0916927 , -0.7739557 ,
        0.28854012,  2.1855774 , -1.5980741 , -1.4473541 ,  2.0013998 ,
       -0.7189214 ,  0.54559475, -1.7186866 ,  1.107063  , -0.5381019 ,
        2.3564332 , -2.2301393 ,  3.9242356 , -2.992324  , -1.73

In [29]:
ks = nlp.vocab.vectors.keys()

In [30]:
[nlp.vocab.vectors[k] for k in ks]

[array([-0.21226  ,  0.61433  , -0.95848  , -0.38835  , -0.0030853,
        -0.77503  ,  0.33569  , -0.33492  ,  0.23969  , -0.18759  ,
         0.16702  ,  0.36635  ,  0.18127  ,  0.021859 ,  0.03728  ,
         0.29685  ,  0.40303  , -0.62576  ,  0.26542  ,  0.37626  ,
        -0.94858  ,  0.65036  , -0.42368  , -0.20075  ,  0.10983  ,
        -0.68955  , -0.2814   , -0.43937  ,  0.08264  , -0.40573  ,
         0.61688  ,  0.63868  ,  0.37528  , -0.040831 ,  0.32049  ,
         0.2247   , -0.59292  ,  0.75082  ,  0.085973 ,  0.19434  ,
         0.276    , -0.31703  , -1.4574   , -0.61571  ,  0.42585  ,
         0.22574  , -0.31286  , -0.51544  , -0.12606  ,  0.17625  ,
        -0.23576  ,  0.36249  ,  0.35335  ,  0.42397  , -0.095281 ,
         0.41381  ,  0.62414  , -0.373    , -0.36517  ,  0.22446  ,
        -0.097271 ,  1.2539   ,  0.94905  ,  0.36757  , -0.1177   ,
        -0.07325  ,  0.60325  , -0.77793  ,  0.10848  ,  0.21557  ,
        -0.35237  ,  0.53112  , -0.46085  ,  0.3

In [31]:
[k for k in ks]

[3424551750583975941,
 4645222075005403145,
 7289792013757579280,
 6550431784243298326,
 17436687061506588697,
 14945432585491185697,
 9919343951691644966,
 5799663568433446955,
 4887145912830263356,
 64,
 65,
 66,
 67,
 68,
 69,
 71,
 72,
 73,
 74,
 75,
 76,
 70,
 79,
 5357316086859563087,
 82,
 83,
 84,
 85,
 86,
 87,
 10836213096704376917,
 6788616644487807058,
 90,
 91,
 92,
 93,
 94,
 95,
 7459488032320651361,
 7786288115332677724,
 99,
 100,
 101,
 102,
 103,
 13620151657038872680,
 88,
 2966906362497335410,
 14920737513487204477,
 5748592392722710652,
 6533694349505462396,
 4721197940939423875,
 8845209893943115918,
 5285246301909287055,
 13113230508286804113,
 4674656215383408789,
 14833357556373520546,
 12244282449068032163,
 9907738424524144807,
 17277978563185737904,
 3815359466778394803,
 15903404572152955063,
 9137969348441276615,
 8903963634556731594,
 15408083569284743375,
 2516081072324739282,
 18146696659608797394,
 320859068509978837,
 7497485344611238104,
 1109556367

In [28]:
inv_wv = {nlp.vocab.vectors[k]:k for k in ks}

TypeError: unhashable type: 'numpy.ndarray'

In [32]:
vectors = Vectors()
vectors.from_glove("/path/to/glove_vectors")

NameError: name 'Vectors' is not defined

In [34]:
import spacy
from spacy.symbols import ORTH, LEMMA
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'I am flying to Frisco')
print([w.text for w in doc])
special_case = [{ORTH: u'Frisco', LEMMA: u'San Francisco'}]
nlp.tokenizer.add_special_case(u'Frisco', special_case)
print([w.lemma_ for w in nlp(u'I am flying to Frisco')])

['I', 'am', 'flying', 'to', 'Frisco']
['-PRON-', 'be', 'fly', 'to', 'San Francisco']
