### Installation

We follow the instructions from https://spacy.io/docs/usage/

In [3]:
!sudo -H python3 -m pip install -U spacy

Collecting spacy
Requirement already up-to-date: six in /usr/local/lib/python3.5/dist-packages (from spacy)
Requirement already up-to-date: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.5/dist-packages (from spacy)
Requirement already up-to-date: msgpack-numpy in /usr/local/lib/python3.5/dist-packages (from spacy)
Requirement already up-to-date: dill<0.3,>=0.2 in /usr/local/lib/python3.5/dist-packages (from spacy)
Requirement already up-to-date: cymem<1.32,>=1.30 in /usr/local/lib/python3.5/dist-packages (from spacy)
Requirement already up-to-date: preshed<2.0.0,>=1.0.0 in /usr/local/lib/python3.5/dist-packages (from spacy)
Requirement already up-to-date: thinc<6.11.0,>=6.10.1 in /usr/local/lib/python3.5/dist-packages (from spacy)
Requirement already up-to-date: numpy>=1.7 in /usr/local/lib/python3.5/dist-packages (from spacy)
Requirement already up-to-date: plac<1.0.0,>=0.9.6 in /usr/local/lib/python3.5/dist-packages (from spacy)
Requirement already up-to-date: msgpack-python in /

### Install Models for various languages

See https://spacy.io/usage/models

In [4]:
# Support for English: download the `en_core_web_lg` model package.
# The leading `!` runs the line as a shell command; the recorded download is about 852 MB.
!sudo python3 -m spacy download en_core_web_lg

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz (852.3MB)
    100% |████████████████████████████████| 852.3MB 49.8MB/s

### Starting with spaCy

We first import the library and create an `nlp` variable by loading the English model `en_core_web_lg` that was downloaded above.

In [2]:
import spacy

# Load the spaCy library, instantiated for English via the `en_core_web_lg` model.
# Note: the first time spaCy is loaded it takes a little while to bring up its modules.
# NOTE(review): the recorded run of this cell failed with
# "AttributeError: module 'spacy' has no attribute 'load'" — presumably a local file or
# directory named `spacy` shadows the installed package, or the install is broken; verify.
nlp = spacy.load("en_core_web_lg")

AttributeError: module 'spacy' has no attribute 'load'

The following example is adapted from https://nicschrading.com/project/Intro-to-NLP-with-spaCy/

In [None]:
# Sample passage used by the cells that follow. Besides ordinary sentences, the last
# line contains abbreviations with periods ("Prof.", "Ph.D.") and a currency amount.
text = """There is an art, it says, or rather, a knack to flying. 
The knack lies in learning how to throw yourself at the ground and miss.
In the beginning the Universe was created. This has made a lot of people
very angry and been widely regarded as a bad move.
This Prof. Panos, Ph.D. costs $12,345.67"""

In [None]:
# Parsing text takes a single call: pass the string to the `nlp` object
# and keep the result, which the following cells iterate over.
doc = nlp(text)

In [None]:
# Materialize the document's tokens as a list and show the first ten
tokens = list(doc)
tokens[:10]

In [None]:
# Inspect the first three tokens of the document.
# Iterating over the doc yields the tokens; each token is an object with many properties.
# A property ending in an underscore (e.g. `orth_`) gives the string representation, while
# the same property without the underscore (e.g. `orth`) gives an index (int) into
# spaCy's vocabulary.
# The probability estimate is based on counts from a 3 billion word corpus.
paired_attributes = (
    ("original", "orth"),
    ("lowercased", "lower"),
    ("lemma", "lemma"),
    ("shape", "shape"),
    ("prefix", "prefix"),
    ("suffix", "suffix"),
)
for i, token in enumerate(doc):
    # Each paired attribute is printed as: label, integer id, string form
    for label, attr in paired_attributes:
        print(label + ":", getattr(token, attr), getattr(token, attr + "_"))
    print("part of speech:", token.pos_)
    print("log probability:", token.prob)
    print("Brown cluster id:", token.cluster)
    print("----------------------------------------")
    # Stop once the tokens at indices 0, 1 and 2 have been shown
    if i >= 2:
        break

#### Get some data

First, let's download a few text files so that we can run our examples.

In [None]:
!mkdir data
!curl -L 'https://raw.githubusercontent.com/cytora/pycon-nlp-in-10-lines/master/data/article.txt' -o data/article.txt
!curl -L 'https://raw.githubusercontent.com/cytora/pycon-nlp-in-10-lines/master/data/pride_and_prejudice.txt' -o data/pride_and_prejudice.txt
!curl -L 'https://raw.githubusercontent.com/cytora/pycon-nlp-in-10-lines/master/data/rand-terrorism-dataset.txt'  -o data/rand-terrorism-dataset.txt

Now we will read the text file and then use the `nlp` object from spaCy to analyze the text.

In [None]:
# Read the downloaded article and parse it with spaCy.
filename = "data/article.txt"
# The `with` block closes the file handle as soon as the text has been read.
# The encoding is stated explicitly so the result does not depend on the machine's locale.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open(filename, 'r', encoding='utf-8') as article_file:
    text = article_file.read()
doc = nlp(text)

##### Print tokens

In [None]:
# Print every token of the document, one token per line
for token in doc:
    print(token)

##### Print sentences

In [None]:
# Print the first 5 sentences (one sentence per line).
# enumerate supplies the counter that is printed and that ends the loop.
for i, sent in enumerate(doc.sents):
    # Check before printing so that exactly five sentences (indices 0-4) are shown;
    # testing `i > 5` after the print let seven sentences through.
    if i >= 5:
        break
    print(i, "==>", sent)

#### Named Entities 

In [None]:
# Distinct lemmatized forms of the named entities found in the document
entities = {ent.lemma_ for ent in doc.ents}
entities

In [None]:
# Parse `text` with the spaCy pipeline again.
# NOTE(review): `text` was already parsed into `doc` right after the article was read, so
# this looks redundant when the cells run in order — confirm before removing.
doc = nlp(text)

In [None]:
# Distinct named entities, each rendered as "<lemma> # <label>"
entities_with_type = {"{} # {}".format(ent.lemma_, ent.label_) for ent in doc.ents}
entities_with_type

In [None]:
# Same "<lemma> # <label>" rendering, restricted to entities whose label is 'ORG'
organizations = {
    "{} # {}".format(ent.lemma_, ent.label_)
    for ent in doc.ents
    if ent.label_ == 'ORG'
}
organizations

#### Noun chunks

In [None]:
# Lemmatized noun chunks of the document, leaving out any whose lemma is
# already present in the `entities` set
noun_chunk_lemmas = (chunk.lemma_ for chunk in doc.noun_chunks)
chunks = [lemma for lemma in noun_chunk_lemmas if lemma not in entities]
chunks

In [None]:
from collections import Counter

# Count the occurrences of each noun chunk, keeping only chunks whose vocabulary
# `prob` value is below -8 (the probability value -8 is an arbitrarily selected threshold).
keywords = Counter(chunk for chunk in chunks if nlp.vocab[chunk].prob < -8)

keywords.most_common(20)

### Vector Embeddings

In [None]:
from itertools import combinations

# Print the pairs of named entities whose similarity is above 0.5.
# combinations() yields every unordered pair of entity mentions exactly once, so an
# entity is no longer compared with itself and no pair is printed a second time with
# its two members swapped.
for ent1, ent2 in combinations(doc.ents, 2):
    similarity = ent1.similarity(ent2)
    if similarity > 0.5:
        print('{} - {} - {}'.format(ent1, ent2, similarity))

### Vector Embeddings Calculations

In [None]:
from numpy import dot
from numpy.linalg import norm


def cosine(v1, v2):
    """Return the cosine similarity of the vectors `v1` and `v2`."""
    return dot(v1, v2) / (norm(v1) * norm(v2))


# Let's see if it can figure out this analogy
# B is to A as C is to ???   (here: UK is to London as France is to ???)
a = nlp.vocab['London']
b = nlp.vocab['UK']
c = nlp.vocab['France']

# Alternative analogy to try:
# a = nlp.vocab['Knicks']
# b = nlp.vocab['New York']
# c = nlp.vocab['Boston']

# Vector arithmetic for the analogy: start from A, subtract B, add C
result = a.vector - b.vector + c.vector

# The three query words are left out of the candidates; build that set once instead
# of rebuilding it for every vocabulary entry
excluded = {a.lower_, b.lower_, c.lower_}

# gather all known words that have a vector, take only the title-cased ones
allWords = [
    w for w in nlp.vocab
    if w.has_vector and w.is_title and w.lower_ not in excluded
]
# sort by similarity to the result, most similar first
allWords.sort(key=lambda w: cosine(w.vector, result), reverse=True)
print("\n----------------------------\nTop 3 closest results:")
for word in allWords[:3]:
    print(word.orth_)

# NOTE(review): no output is recorded for this cell; for London/UK/France the hoped-for
# top answer is presumably "Paris" — verify by running.