In [2]:
import spacy
# Load spaCy's small English pipeline ("sm" = small). The resulting `nlp`
# object is called on raw text to produce a processed Doc.
nlp = spacy.load("en_core_web_sm")

In [3]:
# PARTS OF SPEECH TAGGING
#
# Each token carries two tags:
#   pos_ - the coarse-grained part-of-speech tag
#   tag_ - the fine-grained part-of-speech tag (more detail)

doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

# Print one row per token: text, coarse tag, fine tag, description of the fine tag.
for token in doc:
    # spacy.explain returns None for a label that is not in its glossary, and
    # None cannot be formatted with a width spec (TypeError), so fall back to "".
    tag_description = spacy.explain(token.tag_) or ""
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {tag_description:{10}}")

The        DET        DT         determiner
quick      ADJ        JJ         adjective 
brown      ADJ        JJ         adjective 
fox        NOUN       NN         noun, singular or mass
jumped     VERB       VBD        verb, past tense
over       ADP        IN         conjunction, subordinating or preposition
the        DET        DT         determiner
lazy       ADJ        JJ         adjective 
dog        NOUN       NN         noun, singular or mass
's         PART       POS        possessive ending
back       NOUN       NN         noun, singular or mass
.          PUNCT      .          punctuation mark, sentence closer


In [6]:
def print_token_tags(token):
    """Print a token's text, coarse POS tag, fine-grained tag and tag description.

    Columns are padded to a width of 10 so consecutive calls line up.
    """
    # spacy.explain returns None for a label that is not in its glossary, and
    # None cannot be formatted with a width spec (TypeError), so fall back to "".
    tag_description = spacy.explain(token.tag_) or ""
    print(f"{token.text:{10}} {token.pos_:{10}} {token.tag_:{10}} {tag_description:{10}}")


# The tag assigned to a word depends on its context: the same surface form
# "read" is tagged differently in the two sentences below.
doc = nlp(u"I read books on NLP.")  # here spaCy tags 'read' as present tense

token = doc[1]
print_token_tags(token)

doc2 = nlp(u"I read a book on NLP.")  # here spaCy tags 'read' as past tense
# Note: 'read' in the first sentence could be past tense as well; the tagger
# simply commits to one reading.

token = doc2[1]
print_token_tags(token)

doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

# count_by returns a dict mapping each POS tag to how many times it occurs in
# the doc (e.g., 3 adjectives). The keys are numerical identifiers, not names.
POS_counts = doc.count_by(spacy.attrs.POS)

# Output the POS counts nicely: look each numerical ID up in the vocab to
# recover its text label.
for pos_id, count in sorted(POS_counts.items()):
    print(f"{pos_id}. {doc.vocab[pos_id].text:{5}} {count}")

# We can repeat this same process with the fine-grained POS tags by using the
# TAG attribute instead of the POS one.

read       VERB       VBP        verb, non-3rd person singular present
read       VERB       VBD        verb, past tense
83. ADJ   3
84. ADP   1
89. DET   2
91. NOUN  3
93. PART  1
96. PUNCT 1
99. VERB  1


In [11]:
# VISUALIZING PART OF SPEECH

from spacy import displacy

doc = nlp("Apple will build Brandon's factory for the low low price of $56 billion.")

# Render the 'dep' style diagram inline in the notebook, with custom spacing,
# compact mode, and yellow Arial text on a dark-green (#006400) background.
displacy.render(
    doc,
    style="dep",
    jupyter=True,
    options={
        "distance": 110,
        "compact": True,
        "color": "yellow",
        "bg": "#006400",
        "font": "Arial",
    },
)

In [None]:
# NAMED ENTITY RECOGNITION (NER):

# Locate spans of tokens in the text and classify them into pre-defined
# categories (entity types).

# We can also add our own custom entities!