In [1]:
import spacy

# Load the small English pipeline; `nlp` is called below to parse the text.
SPACY_MODEL = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL)

In [2]:
# Raw input text: three newline-separated paragraphs containing many
# double-quoted phrases. It is passed to `nlp` in the next cell.
# NOTE(review): "a fan when enables her" looks like a typo for "which enables";
# left untouched because the recorded outputs depend on the exact text.
doc = """Alice follows a large white rabbit down a "Rabbit-hole". She finds a tiny door. When she finds a bottle labeled "Drink me", she does, and shrinks, but not enough to pass through the door. She then eats something labeled "Eat me" and grows larger. She finds a fan when enables her to shrink enough to get into the "Garden" and try to get a "Dog" to play with her. She enters the "White Rabbit's tiny House," but suddenly resumes her normal size. In order to get out, she has to use the "magic fan."
She enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. "The Duchess's Cheshire Cat" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's "Mad Tea-Party." After a while, she leaves.
The Queen invites Alice to join the "ROYAL PROCESSION": a parade of marching playing cards and others headed by the White Rabbit. When Alice "unintentionally offends the Queen", the latter summons the "Executioner". Alice "boxes the ears", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream."""

In [3]:
# Run the loaded pipeline over the raw text; the cells below read `spacy_doc`.
spacy_doc = nlp(doc)
# Bare last expression so the notebook echoes the parsed document.
spacy_doc

Alice follows a large white rabbit down a "Rabbit-hole". She finds a tiny door. When she finds a bottle labeled "Drink me", she does, and shrinks, but not enough to pass through the door. She then eats something labeled "Eat me" and grows larger. She finds a fan when enables her to shrink enough to get into the "Garden" and try to get a "Dog" to play with her. She enters the "White Rabbit's tiny House," but suddenly resumes her normal size. In order to get out, she has to use the "magic fan."
She enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. "The Duchess's Cheshire Cat" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's "Mad Tea-Party." After a while, she leaves.
The Queen invites Alice to join the "ROYAL PROCESSION": a parade of marching playing ca

### Tokenization

In [5]:
# Materialise the document's tokens into a list so they can be indexed by position.
tokens = [token for token in spacy_doc]
print(tokens)

[Alice, follows, a, large, white, rabbit, down, a, ", Rabbit, -, hole, ", ., She, finds, a, tiny, door, ., When, she, finds, a, bottle, labeled, ", Drink, me, ", ,, she, does, ,, and, shrinks, ,, but, not, enough, to, pass, through, the, door, ., She, then, eats, something, labeled, ", Eat, me, ", and, grows, larger, ., She, finds, a, fan, when, enables, her, to, shrink, enough, to, get, into, the, ", Garden, ", and, try, to, get, a, ", Dog, ", to, play, with, her, ., She, enters, the, ", White, Rabbit, 's, tiny, House, ,, ", but, suddenly, resumes, her, normal, size, ., In, order, to, get, out, ,, she, has, to, use, the, ", magic, fan, ., ", 
, She, enters, a, kitchen, ,, in, which, there, is, a, cook, and, a, woman, holding, a, baby, ., She, persuades, the, woman, to, give, her, the, child, and, takes, the, infant, outside, after, the, cook, starts, throwing, things, around, ., The, baby, then, turns, into, a, pig, and, squirms, out, of, her, grip, ., ", The, Duchess, 's, Cheshire, C

In [6]:
# Print each sentence the pipeline detected, prefixed with its zero-based position.
for position, sent_span in enumerate(spacy_doc.sents):
    print(position, sent_span)

0 Alice follows a large white rabbit down a "Rabbit-hole".
1 She finds a tiny door.
2 When she finds a bottle labeled "Drink me", she does, and shrinks, but not enough to pass through the door.
3 She then eats something labeled "Eat me" and grows larger.
4 She finds a fan when enables her to shrink enough to get into the "Garden" and try to get a "Dog" to play with her.
5 She enters the "White Rabbit's tiny House," but suddenly resumes her normal size.
6 In order to get out, she has to use the "magic fan.
7 "

8 She enters a kitchen, in which there is a cook and a woman holding a baby.
9 She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around.
10 The baby then turns into a pig and squirms out of her grip.
11 "
12 The Duchess's Cheshire Cat" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's "Mad Tea-Party.
13 "
14 After a while, she leaves.

15 The Queen invites Alice to join the "ROYAL P

### Text Cleaning

In [7]:
# Use the first token as a sample and list every attribute name it exposes.
first_word = tokens[0]
attribute_names = dir(first_word)
print(attribute_names)

['_', '__bytes__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', 'ancestors', 'check_flag', 'children', 'cluster', 'conjuncts', 'dep', 'dep_', 'doc', 'ent_id', 'ent_id_', 'ent_iob', 'ent_iob_', 'ent_kb_id', 'ent_kb_id_', 'ent_type', 'ent_type_', 'get_extension', 'has_extension', 'has_vector', 'head', 'i', 'idx', 'is_alpha', 'is_ancestor', 'is_ascii', 'is_bracket', 'is_currency', 'is_digit', 'is_left_punct', 'is_lower', 'is_oov', 'is_punct', 'is_quote', 'is_right_punct', 'is_sent_start', 'is_space', 'is_stop', 'is_title', 'is_upper', 'lang', 'lang_', 'left_edge', 'lefts', 'lemma', 'lemma_', 'lex_id', 'like_email', 'like_num', 'like_url', 'lower', 'lower_', 'n_lefts', 'n_rights', 'nbor', 'nor

In [13]:
# Show the dependency label spaCy assigned to the first token.
leading_token = tokens[0]
print("dep_:", leading_token.dep_)

dep_: nsubj


### Part-of-Speech Tagging

In [18]:
# Print the part-of-speech tag of every token in the first sentence only.
# The empty-tuple default means nothing is printed when there are no sentences.
sent = next(iter(spacy_doc.sents), ())
for token in sent:
    print("Token: ", token, "POS Tag:", token.pos_)

Token:  Alice POS Tag: PROPN
Token:  follows POS Tag: VERB
Token:  a POS Tag: DET
Token:  large POS Tag: ADJ
Token:  white POS Tag: ADJ
Token:  rabbit POS Tag: NOUN
Token:  down POS Tag: ADP
Token:  a POS Tag: DET
Token:  " POS Tag: PUNCT
Token:  Rabbit POS Tag: NOUN
Token:  - POS Tag: PUNCT
Token:  hole POS Tag: PROPN
Token:  " POS Tag: PUNCT
Token:  . POS Tag: PUNCT


In [23]:
# Print every named entity the pipeline found, together with its label.
for ent in spacy_doc.ents:
    # Skip entities whose text is only whitespace (e.g. a span covering just a newline).
    if ent.text.strip():
        # The earlier version passed a stray ")" as a final argument, which
        # appended " )" to every printed line; it is dropped here.
        print("Entity: ", ent.text, "Label: ", ent.label_)

Entity:  the "Garden" Label:  FAC )
Entity:  a "Dog Label:  WORK_OF_ART )
Entity:  the "White Rabbit's Label:  FAC )
Entity:  House Label:  ORG )
Entity:  The Duchess's Cheshire Cat Label:  WORK_OF_ART )
Entity:  Alice Label:  PERSON )
Entity:  the Mad Hatter's Label:  ORG )
Entity:  Mad Tea-Party Label:  WORK_OF_ART )
Entity:  Queen Label:  PERSON )
Entity:  Alice Label:  PERSON )
Entity:  the White Rabbit Label:  ORG )
Entity:  Alice Label:  PERSON )
Entity:  Queen Label:  PERSON )
Entity:  Executioner Label:  WORK_OF_ART )
Entity:  Alice Label:  PERSON )


spaCy also provides a display rendering tool, displaCy, to visualize these entities and their labels.

In [24]:
# Render the entity annotations inline in the notebook with displaCy.
spacy.displacy.render(
    spacy_doc,
    style="ent",
    jupyter=True,
)

### Noun Chunking

In [25]:
# List the noun chunks of each sentence, numbering sentences from 1.
for sentence_number, sent_span in enumerate(spacy_doc.sents, start=1):
    for chunk in sent_span.noun_chunks:
        print(f"sentence {sentence_number}, noun chunk '{chunk}'")

sentence 1, noun chunk 'Alice'
sentence 1, noun chunk 'a large white rabbit'
sentence 1, noun chunk 'a "Rabbit-hole'
sentence 2, noun chunk 'She'
sentence 2, noun chunk 'a tiny door'
sentence 3, noun chunk 'she'
sentence 3, noun chunk 'a bottle'
sentence 3, noun chunk 'me'
sentence 3, noun chunk 'she'
sentence 3, noun chunk 'shrinks'
sentence 3, noun chunk 'the door'
sentence 4, noun chunk 'She'
sentence 4, noun chunk 'something'
sentence 4, noun chunk 'me'
sentence 5, noun chunk 'She'
sentence 5, noun chunk 'a fan'
sentence 5, noun chunk 'her'
sentence 5, noun chunk 'the "Garden'
sentence 5, noun chunk 'a "Dog'
sentence 5, noun chunk 'her'
sentence 6, noun chunk 'She'
sentence 6, noun chunk 'the "White Rabbit's tiny House'
sentence 6, noun chunk 'her normal size'
sentence 7, noun chunk 'order'
sentence 7, noun chunk 'she'
sentence 7, noun chunk 'the "magic fan'
sentence 9, noun chunk 'She'
sentence 9, noun chunk 'a kitchen'
sentence 9, noun chunk 'a cook'
sentence 9, noun chunk 'a wom