In [11]:
import spacy 
from spacy import displacy
from pathlib import Path

# Chapter One

### Using displacy in jupyter notebooks to render nlp

In [5]:
nlp = spacy.load('en_core_web_md')
doc = nlp('The quick fox jumps over the small dog')
# displacy.serve(doc, style='dep')
displacy.render(doc, style='dep')

In [7]:
doc = nlp('Bill Gates is the CEO of Microsoft.')
# displacy.serve(doc, style='ent')
displacy.render(doc, style='ent')

### Saving displacy renders

In [10]:
doc = nlp("I'm a butterfly.'")
svg = displacy.render(doc, style='dep', jupyter=False)
filename = 'butterfly.svg'
output_path = Path(filename)
output_path.open('w',encoding='utf-8').write(svg)

3023

# Chapter Two

### Introducing tokenization

In [12]:
import spacy
nlp = spacy.load("en_core_web_md")
doc = nlp("I own a ginger cat.")
print ([token.text for token in doc])

['I', 'own', 'a', 'ginger', 'cat', '.']


### Customizing the tokenizer

In [13]:
import spacy
from spacy.symbols import ORTH

nlp = spacy.load('en_core_web_md')
doc = nlp('lemme that')
print([w.text for w in doc])

['lemme', 'that']


In [15]:
special_case = [{ORTH: "lem"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("lemme", special_case)
print([w.text for w in nlp("lemme that")])

['lem', 'me', 'that']


In [16]:
# Special cases take precedence over punctuation splitting
print([w.text for w in nlp("...lemme...?")])
nlp.tokenizer.add_special_case("...lemme...?", [{"ORTH": "...lemme...?"}])
print([w.text for w in nlp("...lemme...?")])

['...', 'lem', 'me', '...', '?']
['...lemme...?']


### Debugging the tokenizer

In [17]:
import spacy
nlp = spacy.load("en_core_web_md")
text = "Let's go!"
doc = nlp(text)

In [18]:
tok_exp = nlp.tokenizer.explain(text)
for t in tok_exp:
    print(t[1], "\t", t[0])

Let 	 SPECIAL-1
's 	 SPECIAL-2
go 	 TOKEN
! 	 SUFFIX


### Sentence segmentation

In [19]:
nlp = spacy.load("en_core_web_md")
text = "I flied to N.Y yesterday. It was around 5 pm."
doc = nlp(text)
for sentence in doc.sents:
    print(sentence.text)

I flied to N.Y yesterday.
It was around 5 pm.


### Understanding lemmatization

In [20]:
nlp = spacy.load("en_core_web_md")
doc = nlp("I went there for working and worked for 3 years.")
for token in doc:
    print(token.text, token.lemma_)

I I
went go
there there
for for
working work
and and
worked work
for for
3 3
years year
. .


In [23]:
# Special rules can be added to capture nicknames
nlp = spacy.load("en_core_web_md")
nlp.get_pipe("attribute_ruler").add([[{"TEXT": "Angeltown"}]], {"LEMMA": "Los Angeles"})
for token in doc:
    print(token.text, token.lemma_)

I I
went go
there there
for for
working work
and and
worked work
for for
3 3
years year
. .


### spaCY container objects

In [24]:
# Tokens
doc = nlp("I like cats")
for token in doc:
    print(token.text)

I
like
cats


In [25]:
# Sents
doc = nlp("This is a sentence. This is the second sentence")
sentences = list(doc.sents)
sentences

[This is a sentence., This is the second sentence]

In [26]:
# Ents
doc = nlp("I flied to New York with Ashley.")
doc.ents

(New York, Ashley)

In [27]:
# Noun chunks
doc = nlp("Sweet brown fox jumped over the fence.")
list(doc.noun_chunks)

[Sweet brown fox, the fence]

In [29]:
# JSON Conversion
doc = nlp("Hi")
json_doc = doc.to_json()
json_doc

{'text': 'Hi',
 'ents': [],
 'sents': [{'start': 0, 'end': 2}],
 'tokens': [{'id': 0,
   'start': 0,
   'end': 2,
   'tag': 'UH',
   'pos': 'INTJ',
   'morph': '',
   'lemma': 'hi',
   'dep': 'ROOT',
   'head': 0}]}

In [30]:
# Spans
doc = nlp("You love Atlanta since you're 20.")
doc.char_span(4,16)

love Atlanta

In [31]:
# iterating over a span
doc = nlp("You went there after you saw me")
span = doc[2:4]
for token in span:
    print(token)

there
after


### Additional features

In [None]:
# Token shape features (similar to native python)
doc = nlp("Hello, hi!")

doc[0].is_upper
doc[0].is_lower
doc[0].is_alpha
doc[0].is_ascii
doc[0].is_digit
doc[0].is_punct
doc[0].is_left_punct # [
doc[0].is_right_punct # ]
doc[0].is_space
doc[0].is_bracket
doc[0].is_quote
doc[0].is_currency # $ 
doc[0].like_url # https://
doc[0].like_num # 100
doc[0].like_email # name@email.com
doc[0].is_oov # unknown words to doc vocabulary
doc[0].is_stop # a, an, and, just, with, the


In [32]:
# Shape_ returns orthographic features
doc = nlp("Girl called Kathy has a nickname Cat123.")
for token in doc:
    print(token.text, token.shape_)

Girl Xxxx
called xxxx
Kathy Xxxxx
has xxx
a x
nickname xxxx
Cat123 Xxxddd
. .
