First things first: make sure that spaCy is installed on your system. If it is not, install it with:
```
pip install spacy
```
Next, download the small English model trained on web text.

In [1]:
# Fetch the en_core_web_sm model package through spaCy's download command.
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 7.7 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [2]:
import spacy

In [3]:
# Load the downloaded model; `nlp` is the callable used to process text in every later cell.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Process a short sentence; the resulting `doc` is rendered with displaCy further down.
doc = nlp("I like this weather")

In [5]:
from spacy import displacy

In [6]:
# To serve the visualization from a local web server instead, use:
# displacy.serve(doc, style="dep")
# Inside a notebook (Jupyter / Google Colab), render inline with jupyter=True:
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

### Custom Style

In [7]:
doc = nlp("This is a sentence in compact mode with custom styles.")
# Visual settings for the dependency figure: compact arcs, background colour,
# foreground colour and font family.
options = dict(compact=True, bg="#09d5d4", color="orange", font="verdana")
displacy.render(doc, style='dep', options=options, jupyter=True)

### Saving as a file

In [8]:
from pathlib import Path

# jupyter=False makes render() hand back the SVG markup as a string
# instead of displaying it in the notebook.
doc = nlp("I'm a butterfly.")
svg = displacy.render(doc, jupyter=False, style="dep")

filename = "butterfly.svg"
output_path = Path(filename)

# Persist the markup as UTF-8 text next to the notebook.
with output_path.open(mode="w", encoding="utf-8") as file:
    file.write(svg)

### Saving as HTML

In [9]:
doc1 = nlp("I own a ginger cat.")
doc2 = nlp("He is very pretty.")

# page=True wraps the figures for both documents in a complete HTML page.
# jupyter=False forces render() to return the markup as a string. With the
# argument left unset, displaCy auto-detects notebook environments and, when
# it does, displays the figure inline and returns None, so `html` would be
# None and nothing useful would be printed.
html = displacy.render([doc1, doc2], style="dep", page=True, jupyter=False)
print(html)

<!DOCTYPE html>
<html lang="en">
    <head>
        <title>displaCy</title>
    </head>

    <body style="font-size: 16px; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; padding: 4rem 2rem; direction: ltr">
<figure style="margin-bottom: 6rem">
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="en" id="20c6329f828c40e49878644fed3fbaab-0" class="displacy" width="925" height="399.5" direction="ltr" style="max-width: none; height: 399.5px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr">
<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
    <tspan class="displacy-word" fill="currentColor" x="50">I</tspan>
    <tspan class="displacy-tag" dy="2em" fill="currentColor" x="50">PRON</tspan>
</text>

<text class="displacy-token" fill="currentColor" text-anchor="middle" y="309.5">
    <tspan class="dis

In [10]:
# Passing a list renders both documents in one call; jupyter=True displays the result inline.
displacy.render([doc1, doc2], style="dep", jupyter=True)

### Displaying Entities

In [11]:
# style="ent" switches displaCy from the dependency view to named-entity highlighting.
doc = nlp("Bill Gates is the CEO of Microsoft.")
displacy.render(doc, style="ent", jupyter=True)

### Custom coloring Entities

In [12]:
# Sample paragraph mentioning several organisations and products.
sentence = (
    "Sony was leading the consumer music devices sector not so long ago before he lost it to Apple. "
    "By birth of music platforms such SoundCloud and Spotify, Sony lost the music battle completely. "
    "Over the last quarter, Apple sold 20.000 iPods for a profit of $5 million. "
    "Whereas Sony was able to sell only 5.000 Walkman music players."
)

doc = nlp(sentence)

# CSS backgrounds per entity label; only the labels listed under "ents" are highlighted.
colors = {
    "ORG": "linear-gradient(326deg, #a4508b, #5f0a87)",
    "PRODUCT": "radial-gradient(yellow, green)",
}
options = {
    "ents": ["ORG", "PRODUCT"],
    "colors": colors,
}
displacy.render(doc, style="ent", jupyter=True, options=options)

### Tokenization

In [13]:
# Two sample sentences for the tokenizer: one plain, one with a contraction and repeated punctuation.
doc1 = nlp("I own a Ferrari")
doc2 = nlp("It's been a crazy week!!!") # contains a contraction and punctuation

In [14]:
# List the text of every token in the plain sentence.
print([token.text for token in doc1])

['I', 'own', 'a', 'Ferrari']


In [15]:
# The contraction and each "!" come out as separate tokens (see the recorded output).
print([token.text for token in doc2])

['It', "'s", 'been', 'a', 'crazy', 'week', '!', '!', '!']


### Customizing Tokens

In [16]:
from spacy.symbols import ORTH

# Baseline: before any special case is registered, "lemme" stays a single token.
doc = nlp("lemme that")
print([w.text for w in doc])

['lemme', 'that']


In [17]:
# Register a tokenizer exception that splits "lemme" into the two pieces "lem" + "me".
special_case = [{ORTH: "lem"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("lemme", special_case)
print([w.text for w in nlp("lemme that")])

['lem', 'me', 'that']


In [18]:
# The special case still fires when punctuation follows; each "!" is split off as its own token.
print([w.text for w in nlp("lemme!!")])

['lem', 'me', '!', '!']


In [19]:
# Register the whole string "lemme!!" as its own single-token special case.
nlp.tokenizer.add_special_case("lemme!!", [{ORTH:"lemme!!"}])

In [20]:
# The exact-match special case takes precedence over punctuation splitting,
# so "lemme!!" now comes back as one token.
print([w.text for w in nlp("lemme!!")])

['lemme!!']


### Debugging Tokenizer

In [21]:
text = "Let's go!"
token_exp = nlp.tokenizer.explain(text)
# Each entry pairs the tokenizer rule that fired with the piece of text it produced;
# print the piece first, then the rule.
for rule, piece in token_exp:
    print(piece, "\t", rule)

Let 	 SPECIAL-1
's 	 SPECIAL-2
go 	 TOKEN
! 	 SUFFIX


### Lemmatization

In [22]:
import pandas as pd

# Tabulate every token of the sentence next to its lemma.
doc = nlp("I went for working and worked for 3 years.")
tokens = [token.text for token in doc]
lemma = [token.lemma_ for token in doc]

# Bind the table to its own name. Assigning it to `pd` would overwrite the
# pandas module alias, so re-running this cell (or any later `pd.` call)
# would fail with an AttributeError.
lemma_df = pd.DataFrame({"Token": tokens, "Lemma": lemma})
lemma_df

Unnamed: 0,Token,Lemma
0,I,-PRON-
1,went,go
2,for,for
3,working,working
4,and,and
5,worked,work
6,for,for
7,3,3
8,years,year
9,.,.


In [23]:
# ORTH was already imported in an earlier cell; it is re-imported here alongside LEMMA.
from spacy.symbols import ORTH, LEMMA

# Special case that keeps "Angeltown" as one token and attaches a custom lemma to it.
# NOTE(review): "Los Angels" looks like a typo for "Los Angeles" — confirm before changing,
# since the recorded output of the following cell shows the current spelling.
special_case = [{ORTH: "Angeltown", LEMMA:"Los Angels"}]
nlp.tokenizer.add_special_case("Angeltown", special_case)
doc = nlp("I'm flying to Angeltown")

In [24]:
# Print each token with its lemma, separated by a tab.
for token in doc:
  print(token.text, "\t", token.lemma_)

I 	 -PRON-
'm 	 be
flying 	 fly
to 	 to
Angeltown 	 Los Angels


### Spacy Container Objects

### Doc

In [25]:
# doc.text gives back the string the Doc was built from.
doc = nlp("I like cats")
doc.text

'I like cats'

In [26]:
# Iterating a Doc yields its tokens in order.
for token in doc:
  print(token.text)

I
like
cats


In [27]:
# len() of a Doc is its number of tokens.
len(doc)

3

In [28]:
# Integer indexing returns a single token.
print(doc[1])

like


In [29]:
# doc.sents is a generator over the sentences, so the bare expression only shows a generator repr.
doc2 = nlp("This is a sentence. This is the second sentence.")
doc2.sents

<generator at 0x7fbc457a7af0>

In [30]:
# Materialize the generator into a list to see the sentences themselves.
sentences = list(doc2.sents)
print(sentences)

[This is a sentence., This is the second sentence.]


In [31]:
# doc.ents holds the named entities found in the text.
# In the recorded run only "New York" is returned; "Ashley" is not tagged.
doc3 = nlp("I flied to New York with Ashley")
print(doc3.ents)

(New York,)


In [32]:
# noun_chunks yields the noun phrases of the sentence; lang_ is the language code as a string.
doc4 = nlp("Sweet brown fox jumped over the fence.")
print(list(doc4.noun_chunks))
print(doc4.lang_)

[Sweet brown fox, the fence]
en


In [33]:
# Serialization: to_json() returns a plain dict holding the text plus entity,
# sentence and token annotations (see the recorded output).
doc = nlp("Hi")
json_doc = doc.to_json()
json_doc

{'ents': [],
 'sents': [{'end': 2, 'start': 0}],
 'text': 'Hi',
 'tokens': [{'dep': 'ROOT',
   'end': 2,
   'head': 0,
   'id': 0,
   'pos': 'INTJ',
   'start': 0,
   'tag': 'UH'}]}

**NOTE**:  
`doc.lang_` returns the language as a unicode string, whereas `doc.lang` returns the integer language ID. The same convention applies to `token.lemma_`, `token.tag_` and `token.pos_`: the attribute with the trailing underscore is the string form.

### Token

In [34]:
doc = nlp("Hi Madam!")

# Per token: text, text plus trailing whitespace, length in characters,
# character offset within the document, the owning Doc, and the containing sentence.
for token in doc:
  print(token.text, token.text_with_ws, len(token), token.idx, token.doc, token.sent)

Hi Hi  2 0 Hi Madam! Hi Madam!
Madam Madam 5 3 Hi Madam! Hi Madam!
! ! 1 8 Hi Madam! Hi Madam!


In [35]:
# Two sentences, used in the following cells to inspect sentence boundaries.
doc = nlp("He entered the room. Then he nodded.")

In [36]:
# Confirm how the text was split into sentences.
print(list(doc.sents))

[He entered the room., Then he nodded.]


In [37]:
# Token 0 opens the first sentence.
doc[0].is_sent_start

True

In [38]:
# Token 5 ("Then") opens the second sentence.
doc[5].is_sent_start

True

In [39]:
# Token 6 is inside the second sentence, not at its start. The recorded run shows
# no output for this cell, which is what a notebook does when the value is None.
doc[6].is_sent_start

In [40]:
# lemma_ gives the base form of a token: here "went" maps to "go".
doc = nlp("I went there.")
doc[1].lemma_

'go'

In [41]:
# Sentence containing named entities, inspected in the following cells.
doc = nlp("President Trump visited Mexico City")

In [42]:
# Entity spans recognized in the sentence.
doc.ents

(Trump, Mexico City)

In [43]:
# Show every token next to its entity label; the label is empty for tokens
# that are not part of any entity.
for token in doc:
    print(token, "\t", token.ent_type_)

President 	 
Trump 	 PERSON
visited 	 
Mexico 	 GPE
City 	 GPE


In [44]:
# List every attribute and method name available on the Doc object.
dir(doc)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '_bulk_merge',
 '_py_tokens',
 '_realloc',
 '_vector',
 '_vector_norm',
 'cats',
 'char_span',
 'count_by',
 'doc',
 'ents',
 'extend_tensor',
 'from_array',
 'from_bytes',
 'from_disk',
 'get_extension',
 'get_lca_matrix',
 'has_extension',
 'has_vector',
 'is_nered',
 'is_parsed',
 'is_sentenced',
 'is_tagged',
 'lang',
 'lang_',
 'mem',
 'merge',
 'noun_chunks',
 'noun_chunks_iterator',
 'print_tree',
 'remove_extension',
 'retokenize',
 'sentiment',
 'sents',
 'set_extension',
 'similarity',
 'tensor',
 'text',
 'text_with_ws',
 'to_array',
 'to_byte

### Span

In [45]:
# Slicing a Doc with token indices returns a Span; here tokens 2 and 3.
doc = nlp("I know that you have been to USA.")
print(doc[2:4])

that you


In [46]:
doc2 =  nlp("President Trump visited Mexico City.")
print(doc2[4:])  # token 4 through the end, including the final "."
print(doc2[3:-1]) # token 3 up to, but excluding, the last token (the final ".")

City.
Mexico City


In [47]:
# char_span builds a Span from character offsets (start 4, end 16) rather than token indices.
doc3 = nlp("You love Atlanta since you're 20.")
print(doc3.char_span(4, 16))

love Atlanta


In [48]:
doc4 = nlp("You went there after you saw me.")
span = doc4[2:4]

# The span itself, followed by each token it contains.
print(span)
for member in span:
    print(member)

# Size, enclosing sentence and owning document.
print(len(span))
print(span.sent)
print(span.doc)

# Token-index and character-offset boundaries of the span.
for label, value in (
    ("Start:", span.start),
    ("End:", span.end),
    ("StartChar:", span.start_char),
    ("EndChar:", span.end_char),
):
    print(label, value)

there after
there
after
2
You went there after you saw me.
You went there after you saw me.
Start: 2
End: 4
StartChar: 9
EndChar: 20


### More Spacy features

In [49]:
# lower_ is the lowercased form of the token text.
doc = nlp("Hello, hi!")
print(doc[0].lower_)

hello


In [50]:
# is_upper / is_lower report whether a token is entirely upper- or lowercase.
# NOTE(review): doc1[1] is presumably the "," token, since punctuation is split off
# elsewhere in this notebook — confirm whether doc1[2] ("Hello") was the intended example.
doc1 = nlp("HELLO, Hello, hello, hEllO")
print(doc1[0].is_upper)
print(doc1[0].is_lower)
print(doc1[1].is_upper)
print(doc1[1].is_lower)

True
False
False
False


In [51]:
doc2 = nlp("Cat! and Cat123")
# Check the first four tokens: is_alpha is True only for purely alphabetic tokens.
for position in range(4):
    print(doc2[position].is_alpha)

True
False
True
False


In [52]:
doc3 = nlp("Hamburg and Göttingen")
# Compare the two city names: the second one contains a non-ASCII character.
for position in (0, 2):
    print(doc3[position].is_ascii)

True
False


In [53]:
doc4 = nlp("Cat Cat123 123")
# is_digit is True only when the whole token consists of digits.
for position in range(3):
    print(doc4[position].is_digit)

False
False
True


In [54]:
# Token 1 is the comma, which is_punct flags as punctuation.
doc5 = nlp("You, him and Sally")
print(doc5[1])
print(doc5[1].is_punct)

,
True


In [55]:
# A lone space is tokenized as a single whitespace token of length 1.
doc6 = nlp(" ")
print(doc6[0])
print(len(doc6[0]))
print(doc6[0].is_space)

 
1
True


In [56]:
# "12$" is split so that "$" becomes its own token (index 3), flagged as a currency symbol.
doc7 = nlp("I paid 12$ for the t-shirt.")
print(doc7[3])
print(doc7[3].is_currency)

$
True


In [57]:
# like_num is True for a token written in digits such as "100".
doc8 = nlp("I emailed you at least 100 times")
print(doc8[-2])
print(doc8[-2].like_num)

100
True


In [58]:
# like_num also recognizes number words: "hundred" is reported as number-like.
doc9 = nlp("I emailed you at least hundred times")
print(doc9[-2])
print(doc9[-2].like_num)


hundred
True


In [59]:
# like_email / like_url flag tokens that look like an email address or a URL;
# tokens 3 and 10 are the address and the link in this sentence.
doc10 = nlp("My email is duygu@packt.com and you can visit me under https://duygua.github.io any time you want.")
print(doc10[3])
print(doc10[3].like_email)
print(doc10[10])
print(doc10[10].like_url)


duygu@packt.com
True
https://duygua.github.io
True


In [60]:
# Print only the tokens that look like an email address or a URL.
for token in doc10:
    if not (token.like_email or token.like_url):
        continue
    print(token.text)

duygu@packt.com
https://duygua.github.io


In [61]:
# shape_ abstracts each token's characters: X = uppercase letter, x = lowercase letter,
# d = digit. In the recorded output, runs of the same class longer than four are cut
# to four (e.g. "nickname" -> "xxxx").
doc11 = nlp("Girl called Kathy has a nickname Cat123.")
for token in doc11:
    print(token.text, token.shape_)

Girl Xxxx
called xxxx
Kathy Xxxxx
has xxx
a x
nickname xxxx
Cat123 Xxxddd
. .
