In [2]:
import spacy
nlp = spacy.load('en_core_web_sm') # load the English language model and bind the loaded pipeline to the name nlp
# 'en_core_web_sm' is the small ("sm") English model

In [2]:
def tokenize(doc):
    """Print one line per token in `doc`: its text, part-of-speech tag and dependency label.

    `doc` is any iterable of token objects exposing `.text`, `.pos_` and `.dep_`
    (e.g. a processed spacy document). Nothing is returned; the attributes are
    written to stdout separated by single spaces.
    """
    for tok in doc:
        attributes = (tok.text, tok.pos_, tok.dep_)
        print(*attributes)

# nlp() reads in the unicode string and sends it through a "processing pipeline":
# the text is parsed into tokens (individual words) and a series of operations is run on it.
# The returned doc object holds the processed text.
doc = nlp(u'My name is Brandon and I have to write a card for my birthday.')

# tokenization: print text, part of speech and dependency label for every token
doc2 = nlp(u"Tesla's stock is dropping. Oh the horror!")
tokenize(doc2)
# a specific token of the doc object can be accessed by indexing (doc2[0])

# slicing the processed string (e.g. doc[0:3]) takes a range of tokens;
# spacy hands that range back as a Span object

# spacy can automatically separate the string into sentences
for sent in doc2.sents:
    print(sent)

Tesla PROPN poss
's PART case
stock NOUN nsubj
is VERB aux
dropping VERB ROOT
. PUNCT punct
Oh INTJ intj
the DET det
horror NOUN ROOT
! PUNCT punct
Tesla's stock is dropping.
Oh the horror!


In [6]:
# tokenization breaks the sentence up into smaller components (e.g. words);
# it is the fundamental step for understanding the sentence:
# split by whitespace, then remove characters at the beginning/end, then look at special characters

# \' is the escape sequence that puts an apostrophe inside a single-quoted string

example = '"This is an example for Brandon\'s NLP Task"'
doc = nlp(example)

# to print every token of the doc:
# for token in doc:
#     print(token)

# the Doc object is immutable: we cannot reassign or replace any of its tokens

# NAMED ENTITY RECOGNITION:

doc2 = nlp(u'Apple will build Brandon\'s horrible factory for the low low price of $56 billion.')

# to print the tokens on a single line, separated by ' | ':
# for token in doc2:
#     print(token.text,end=' | ')

# doc2.ents holds the named entities spacy recognized in the sentence
for ent in doc2.ents:
    print(ent)
    print(ent.label_)                 # the entity type (e.g. ORG, PERSON, MONEY)
    print(spacy.explain(ent.label_))  # human-readable description of that label
    print()                           # blank line between entities

# noun chunks combine a noun with the words that modify it (e.g. "the low low price")
for noun_chunk in doc2.noun_chunks:
    print(noun_chunk)

Apple
ORG
Companies, agencies, institutions, etc.

Brandon
PERSON
People, including fictional

$56 billion
MONEY
Monetary values, including unit

Apple
Brandon's horrible factory
the low low price


In [3]:
from spacy import displacy

# VISUALIZE THE PARSE!! style='dep' draws the dependencies between the tokens,
# which is very similar to the syntax tree that Dylan mentioned the other day

doc = nlp(u'Apple will build Brandon\'s factory for the low low price of $56 billion.')

displacy.render(
    doc,
    style='dep',
    jupyter=True,
    options={'distance':110},  # distance is the distance between the tokens in the output
)

In [4]:
# this displacy call is different from the previous one on tokens:
# style='ent' focuses only on the named entities (person, place, thing)
# NOTE: `doc` is not created in this cell; it must already exist from an earlier cell

displacy.render(doc, jupyter=True, style='ent')

In [None]:
# HOWEVER, to visualize the parse OUTSIDE of a jupyter notebook (e.g. from a python script), use displacy.serve instead of displacy.render.
# The snippet below is kept inside a string literal so it is not executed in the notebook:

'''
doc = nlp(u'Input your sentence here.')
displacy.serve(doc,style='dep')
'''

In [3]:
# STEMMING: When we have a base (stem) word and we use it to find variations of that word
# e.g. if table is our stem, then its variations are tables, tabled, table-ing
# But stemming is a pretty basic method because we just remove letters from the end of the word until we reach the stem
# So spacy uses lemmatization instead

# LEMMATIZATION:

doc = nlp(u"I am a player playing in a play because I love to play and played yesterday.")

for token in doc:
    # token.lemma  -> the hash (a sequence of digits) for the lemma or true stem
    # token.lemma_ -> the actual lemma or stem text
    # columns: text (width 12), part of speech (width 6), lemma hash (left-aligned, width 22), lemma text
    print(f'{token.text:12} {token.pos_:6} {token.lemma:<22} {token.lemma_}')

# STOP WORDS:
# Note: we can add or remove stop words from the nlp vocabulary
# Note: we can access a specific word in the nlp vocabulary with nlp.vocab['word']

I            PRON   561228191312463089     -PRON-
am           VERB   10382539506755952630   be
a            DET    11901859001352538922   a
player       NOUN   3885285634180617021    player
playing      VERB   8228585124152053988    play
in           ADP    3002984154512732771    in
a            DET    11901859001352538922   a
play         NOUN   8228585124152053988    play
because      ADP    16950148841647037698   because
I            PRON   561228191312463089     -PRON-
love         VERB   3702023516439754181    love
to           PART   3791531372978436496    to
play         VERB   8228585124152053988    play
and          CCONJ  2283656566040971221    and
played       VERB   8228585124152053988    play
yesterday    NOUN   1756787072497230782    yesterday
.            PUNCT  12646065887601541794   .


In [7]:
# PHRASE MATCHING WITH TOKENS

from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab) # pass in the current vocabulary of the nlp object


def show_matches(matches, doc):
    """Print one line per match: match id, rule name, start index, end index and matched text.

    matches -- the (match_id, start, end) tuples returned by calling a matcher on `doc`
    doc     -- the processed document that was matched; it is sliced to recover the matched span
    """
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # get string representation of the rule name
        span = doc[start:end]                    # get the matched span
        print(match_id, string_id, start, end, span.text)


# Define possible patterns for the phrase "solar power" with tokens
# Each dictionary in the lists below corresponds to one token

# Solarpower
pattern1 = [{'LOWER':'solarpower'}] # if we make the token lowercase, then is it "solarpower"?

# Solar-power
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True},{'LOWER':'power'}] # if we make the first token lowercase, is it "solar"?
# is the second token a form of punctuation (like a hyphen)? if we make the third token lowercase, is it "power"?

# Solar power
pattern3 = [{'LOWER':'solar'},{'LOWER':'power'}] # if we make the first token lowercase, is it "solar"?
# if we make the second token lowercase, is it "power"?

# LOWER is the specific attribute, and we can use other attributes

matcher.add('SolarPower',None,pattern1,pattern2,pattern3) # we add the 3 patterns to the matching object, under the heading "SolarPower"

doc = nlp(u"The Solar Power industry is amazing. Solarpower continues to increase and we need solar-power objects.")

found_matches = matcher(doc) # find the matches in the doc string

# print(found_matches)

show_matches(found_matches, doc)

# if we want to adjust the patterns in the matcher object, just remove them

matcher.remove('SolarPower')

# better patterns for the phrase "solar power"
pattern1 = [{'LOWER':'solarpower'}]
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':True,'OP':'*'},{'LOWER':'power'}] # for the second token, we check if it's punctuation.
# because of 'OP':'*' there can be zero or more punctuation tokens between "solar" and "power",
# so this one pattern covers "solar power", "solar-power" and "solar--power"

matcher.add('SolarPower',None,pattern1,pattern2) # we add the 2 patterns to the matching object, under the heading "SolarPower"

doc2 = nlp(u"What is the solar--power that is necessary to have the SolarPower?")

found_matches = matcher(doc2)

print("New Pattern Matching:")

show_matches(found_matches, doc2)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 7 8 Solarpower
8656102463236116519 SolarPower 14 17 solar-power
New Pattern Matching:
8656102463236116519 SolarPower 3 6 solar--power
8656102463236116519 SolarPower 12 13 SolarPower


In [7]:
# NEW FORM OF PATTERN MATCHING, USING PHRASES INSTEAD OF TOKENS:

from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)

# NOTE(review): no encoding is passed to open(), so the platform default is used. An earlier note here
# said there seemed to be an issue with reading in this file -- confirm the file's actual encoding
# and pass it explicitly once known.
with open('nlp_course_notes/TextFiles/reaganomics.txt') as f:
    doc = nlp(f.read())

# ECONOMICS MATCHER
phrase_list = ['voodoo economics','supply-side economics','trickle-down economics']

# convert each phrase in the list into a spacy doc; the matcher compares the token text only,
# so tokenizing with nlp.make_doc is enough and skips the rest of the pipeline
phrase_patterns = [nlp.make_doc(txt) for txt in phrase_list]

matcher.add('EconomicsMatcher',None,*phrase_patterns) # we need * here because it will pass in each doc in phrase_patterns as a separate parameter

found_matches = matcher(doc)

for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

7040560306600519277 EconomicsMatcher 41 45 supply-side economics
7040560306600519277 EconomicsMatcher 49 53 trickle-down economics
7040560306600519277 EconomicsMatcher 54 56 voodoo economics
7040560306600519277 EconomicsMatcher 673 677 supply-side economics
7040560306600519277 EconomicsMatcher 2988 2992 trickle-down economics
