In [79]:
import spacy

# Tokenization using spacy.blank("")

In [80]:
# Start from a blank English pipeline and run a sample sentence through it.
nlp = spacy.blank("en")
doc = nlp("Dr. Dtrange loves pav bhaji of mumbai as it coasts only 2$ per plate.")

# Iterating over a Doc yields its tokens; emit one token per line.
print(*doc, sep="\n")

Dr.
Dtrange
loves
pav
bhaji
of
mumbai
as
it
coasts
only
2
$
per
plate
.


# Span Object

In [82]:
# Slicing a Doc by token positions gives back a Span object (see the printed type).
span = doc[0:5]
print(span, type(span), sep="\n")

Dr. Dtrange loves pav bhaji
<class 'spacy.tokens.span.Span'>


# Using indexing and attributes to grab tokens

In [74]:
# Re-tokenize a short sentence that contains a number word ("two") and a
# currency symbol ("$"); the following cells inspect its tokens.
doc = nlp("Tony gave two $ to Peter.")

In [75]:
# Pick out the first token by index and inspect it: its text, its type,
# whether it looks like a number, and every attribute name it exposes.
token0 = doc[0]
print(token0, type(token0), token0.like_num, dir(token0), sep="\n")

Tony
<class 'spacy.tokens.token.Token'>
False
['_', '__bytes__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', 'ancestors', 'check_flag', 'children', 'cluster', 'conjuncts', 'dep', 'dep_', 'doc', 'ent_id', 'ent_id_', 'ent_iob', 'ent_iob_', 'ent_kb_id', 'ent_kb_id_', 'ent_type', 'ent_type_', 'get_extension', 'has_dep', 'has_extension', 'has_head', 'has_morph', 'has_vector', 'head', 'i', 'idx', 'iob_strings', 'is_alpha', 'is_ancestor', 'is_ascii', 'is_bracket', 'is_currency', 'is_digit', 'is_left_punct', 'is_lower', 'is_oov', 'is_punct', 'is_quote', 'is_right_punct', 'is_sent_end', 'is_sent_start', 'is_space', 'is_stop', 'is_title', 'is_upper', 'lang', 'lang_', 'left_edge', 'lefts', 'lemma', 

In [76]:
# "two" is spelled out as a word, yet like_num still reports it as number-like.
token2 = doc[2]
print(token2.text, token2.like_num, sep="\n")

two
True


In [77]:
# The token at index 3 is the "$" sign; is_currency reports whether a token
# is a currency symbol.
token3 = doc[3]
print(f"{token3}\n{token3.is_currency}")

$
True


In [78]:
# Summarise a few lexical flags for every token in the sentence, one line per
# token. token.i is the token's position within the Doc.
for token in doc:
    print(token, "==>", "index: ", token.i,
          "is_alpha: ", token.is_alpha,
          "is_punct: ", token.is_punct,  # label mirrors the attribute name
          "like_num: ", token.like_num,
          "is_currency: ", token.is_currency)

Tony ==> index:  0 is_alpha:  True is_punch:  False like_num:  False is_currency:  False
gave ==> index:  1 is_alpha:  True is_punch:  False like_num:  False is_currency:  False
two ==> index:  2 is_alpha:  True is_punch:  False like_num:  True is_currency:  False
$ ==> index:  3 is_alpha:  False is_punch:  False like_num:  False is_currency:  True
to ==> index:  4 is_alpha:  True is_punch:  False like_num:  False is_currency:  False
Peter ==> index:  5 is_alpha:  True is_punch:  False like_num:  False is_currency:  False
. ==> index:  6 is_alpha:  False is_punch:  True like_num:  False is_currency:  False


# Collecting student email IDs from the student information sheet

In [44]:
# Read the student sheet into a list of lines (readlines keeps the trailing
# newline characters). The encoding is passed explicitly so the result does
# not depend on the platform's default text encoding.
# NOTE(review): assumes students.txt is UTF-8/ASCII, as its displayed content
# suggests - confirm if the file comes from another source.
with open("students.txt", encoding="utf-8") as f:
    text = f.readlines()
text

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirthday\temail\n',
 '-----\t--------\t-----\n',
 'Virat\t5 June, 1882\tvirat@kohli.com\n',
 'Maria\t12 April, 2001\tmaria@sharapova.com\n',
 'Serena\t24 June, 1998\tserena@williams.com\n',
 'Joe\t1 May, 1997\tjoe@root.com']

In [45]:
# Collapse the list of lines into a single string for the tokenizer.
# `text` is rebound from a list to a str here, so the join is guarded: running
# this cell a second time would otherwise join the individual characters of
# the already-joined string with spaces and corrupt it.
if not isinstance(text, str):
    text = ' '.join(text)
text



In [46]:
# Tokenize the whole sheet and keep the text of every token flagged as
# email-like.
doc = nlp(text)
emails = [tok.text for tok in doc if tok.like_email]
emails

['virat@kohli.com',
 'maria@sharapova.com',
 'serena@williams.com',
 'joe@root.com']

# Support for other languages

In [53]:
# Switch to a blank Bengali pipeline and tokenize a Bengali sentence.
nlp = spacy.blank("bn")
doc = nlp("আমি তাকে ২ হাজার ৳ দিয়েছিলাম, সে তা ফেরত দেয়নি।")

# One line per token: its text, then is_currency, then like_num.
for tok in doc:
    print(f"{tok} {tok.is_currency} {tok.like_num}")

আমি False False
তাকে False False
২ False True
হাজার False False
৳ True False
দিয়েছিলাম False False
, False False
সে False False
তা False False
ফেরত False False
দেয়নি False False
। False False


In [64]:
# Back to a blank English pipeline. With the default rules "gimme" comes out
# as a single token (see the list displayed below).
nlp = spacy.blank("en")
doc = nlp("gimme double cheese extra large healthy pizza")

token = [tok.text for tok in doc]
token

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

# Customize Tokenizer

In [65]:
from spacy.symbols import ORTH

# Register a tokenizer special case so that "gimme" is split into the two
# tokens "gim" and "me"; each piece is described by its ORTH (verbatim text).
gimme_pieces = [{ORTH: piece} for piece in ("gim", "me")]
nlp.tokenizer.add_special_case("gimme", gimme_pieces)

# Tokenize the same sentence again to see the effect of the special case.
doc = nlp("gimme double cheese extra large healthy pizza")
token = [tok.text for tok in doc]
token

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

# Sentence tokenization (segmentation) by manually adding the 'sentencizer' pipe

In [68]:
# Add the sentence splitter to the pipeline only if it is not there yet:
# add_pipe rejects a component name that is already registered, so running
# this cell twice without the guard would raise.
if 'sentencizer' not in nlp.pipe_names:
    nlp.add_pipe('sentencizer')
# Show the component as the cell result whether or not it was just added.
nlp.get_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x24c58bc84c0>

In [70]:
# List the names of the components currently registered on the pipeline.
nlp.pipe_names

['sentencizer']

In [71]:
# With the sentencizer in the pipeline, doc.sents yields the sentences of the
# text. As the output shows, the period in "Dr." does not end a sentence here.
doc = nlp("Dr. Strange loves fuchka in Dhaka. Hulk loves puri in Sylhet")
print(*doc.sents, sep="\n")

Dr. Strange loves fuchka in Dhaka.
Hulk loves puri in Sylhet
