In [1]:
import spacy
nlp = spacy.load('en_core_web_sm') # load spaCy's English pipeline 'en_core_web_sm' and bind it to the name nlp
# 'en_core_web_sm' is the small English model

In [2]:
def tokenize(doc):
    """Print one line per token of ``doc``: its text, part of speech and dependency label.

    ``doc`` is only iterated here; the callers in this notebook pass the result
    of ``nlp(...)``. Each item must expose ``text``, ``pos_`` and ``dep_``.
    Nothing is returned; the three attributes are written to stdout separated
    by single spaces.
    """
    for tok in doc:
        # the token attributes to show, in output order
        fields = (tok.text, tok.pos_, tok.dep_)
        print(*fields)

# nlp() reads in the unicode string and parses the sentence into tokens (individual words)
doc = nlp(u'My name is Brandon and I have to write a card for my birthday.') # doc holds the processed text
# nlp() -> the text enters a "processing pipeline", which breaks the text down and runs a series of operations on it

# tokenization: print text / part of speech / dependency label for every token of a second example
doc2 = nlp(u"Tesla's stock is dropping. Oh the horror!")
tokenize(doc2)
# individual tokens of the processed text can be accessed by index (e.g. doc2[0])

# the processed text can also be sliced to take a range of tokens (e.g. doc[0:3])
# spaCy returns a Span object for such a slice because it covers a range of tokens

# spaCy can automatically separate the text into sentences
for sentence in doc2.sents:
    print(sentence)

Tesla PROPN poss
's PART case
stock NOUN nsubj
is VERB aux
dropping VERB ROOT
. PUNCT punct
Oh INTJ intj
the DET det
horror NOUN ROOT
! PUNCT punct
Tesla's stock is dropping.
Oh the horror!


In [12]:
# Tokenization breaks a sentence up into smaller components (e.g. words).
# It is the fundamental step for understanding the sentence:
# the text is split by whitespace, then characters at the beginning/end of each
# piece are removed, then special characters are looked at.

# \' (backslash followed by an apostrophe) is the escape sequence that puts an
# apostrophe inside a single-quoted string

example = '"This is an example for Brandon\'s NLP Task"'
doc = nlp(example)

# To inspect the individual tokens:
#     for token in doc:
#         print(token)

# the Doc object is immutable: we cannot reassign or replace any of its tokens

# NAMED ENTITY RECOGNITION:

doc2 = nlp(u'Apple will build Brandon\'s factory for the low low price of $56 billion.')

# Plain token-by-token printing, for comparison:
#     for token in doc2:
#         print(token.text,end=' | ')

for entity in doc2.ents: # the named entities spaCy recognized in the sentence
    print(entity)
    print(entity.label_) # the entity's type label (e.g. ORG, PERSON, MONEY)
    print(spacy.explain(entity.label_)) # description of that label; print() already converts it to text
    print('')

for chunk in doc2.noun_chunks: # noun chunks: a noun together with the words describing it (e.g. "the low low price")
    print(chunk)

Apple
ORG
Companies, agencies, institutions, etc.

Brandon
PERSON
People, including fictional

$56 billion
MONEY
Monetary values, including unit

Apple
Brandon's factory
the low low price


In [22]:
from spacy import displacy

# VISUALIZE THE DEPENDENCY PARSE (style='dep').
# This is very similar to the syntax tree that Dylan mentioned the other day.

doc = nlp(u'Apple will build Brandon\'s awesome factory for the low low price of $56 billion.')

displacy.render(
    doc,
    style='dep',
    jupyter=True,
    options={'distance': 110},  # distance between the tokens in the rendered output
)

In [20]:
# this displaCy view is different from the style='dep' one drawn over the tokens
# style='ent' focuses only on the named entities (person, place, thing)

displacy.render(doc,style='ent',jupyter=True) # no 'distance' option is passed for this style

In [None]:
# HOWEVER, to view the dependency visualization OUTSIDE of a Jupyter notebook
# (e.g. from a plain Python script), use displacy.serve instead:
#
#     doc = nlp(u'Input your sentence here.')
#     displacy.serve(doc,style='dep')
#
# (Written as comments rather than a bare triple-quoted string so that running
# this cell does not echo the snippet back as cell output.)

In [None]:
# STEMMING: reducing the variations of a word to their common base (stem) word
# e.g. if "table" is our stem, then its variations include "tables", "tabled" and "tabling"

