In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
# Sample sentence.
s = "He didn't want to pay $20 for this book."
doc = nlp(s)

In [3]:
# We can iterate over this **Doc** object and view the tokens.

print([t.text for t in doc])

# Note how
# - "didn't" is separated into "did"  and "n't".
# - the currency symbol and amount are separated.
# - the period at the end of the sentence is its own token.

['He', 'did', "n't", 'want', 'to', 'pay', '$', '20', 'for', 'this', 'book', '.']


In [4]:
# A Doc object is a container of other objects, namely Token and Span objects.
print(type(doc[0]))


<class 'spacy.tokens.token.Token'>


In [5]:
# Access a token's index in a sentence.
print([(t.text, t.i) for t in doc])

[('He', 0), ('did', 1), ("n't", 2), ('want', 3), ('to', 4), ('pay', 5), ('$', 6), ('20', 7), ('for', 8), ('this', 9), ('book', 10), ('.', 11)]


In [6]:
# Slicing a Doc object returns a Span object.
print(doc[0:3])
print(type(doc[0:3]))

He didn't
<class 'spacy.tokens.span.Span'>


In [7]:
# You can view the original input like so:
print(doc.text)

He didn't want to pay $20 for this book.


In [8]:
s = """Either the well was very deep, or she fell very slowly, for she
had plenty of time as she went down to look about her and to wonder what
was going to happen next. First, she tried to look down and make out what
she was coming to, but it was too dark to see anything; then she looked at
the sides of the well, and noticed that they were filled with cupboards and
book-shelves; here and there she saw maps and pictures hung upon pegs."""

doc = nlp(s)

# Look at individual sentences (there should be two 'Span' objects).
print([sent for sent in doc.sents])

[Either the well was very deep, or she fell very slowly, for she 
had plenty of time as she went down to look about her and to wonder what 
was going to happen next., First, she tried to look down and make out what 
she was coming to, but it was too dark to see anything; then she looked at 
the sides of the well, and noticed that they were filled with cupboards and 
book-shelves; here and there she saw maps and pictures hung upon pegs.]


# NLTK 

In [9]:
from nltk.tokenize import sent_tokenize, word_tokenize

text = ("Natural language processing (NLP) is a field "
       "of computer science, artificial intelligence "
       "and computational linguistics concerned with "
       "the interactions between computers and human "
       "(natural) languages, and, in particular, "
       "concerned with programming computers to "
       "fruitfully process large natural language "
       "corpora. Challenges in natural language "
       "processing frequently involve natural "
       "language understanding, natural language"
       "generation frequently from formal, machine"
       "-readable logical forms), connecting language "
       "and machine perception, managing human-"
       "computer dialog systems, or some combination "
       "thereof.")

print(sent_tokenize(text))
print(word_tokenize(text))

['Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora.', 'Challenges in natural language processing frequently involve natural language understanding, natural languagegeneration frequently from formal, machine-readable logical forms), connecting language and machine perception, managing human-computer dialog systems, or some combination thereof.']
['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'computer', 'science', ',', 'artificial', 'intelligence', 'and', 'computational', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', '(', 'natural', ')', 'languages', ',', 'and', ',', 'in', 'particular', ',', 'concerned', 'with', 'programming', 'computers', 'to'

In [10]:
word = "i don't wanna go out today. it's not good MR.amal!"

In [11]:
from nltk.tokenize import (word_tokenize,
                           TreebankWordDetokenizer,
                           wordpunct_tokenize,
                           TweetTokenizer,
                           MWETokenizer)

# word tokenizer
print(word_tokenize(word))

# wordpunct_tokenize
# This tokenizer splits the sentences into words based on whitespaces and punctuations.
print("===========wordpunct_tokenize==========")
print(wordpunct_tokenize(word))


# TreebankWordDetokenizer
print("===========TreebankWordDetokenizer==========")
print(TreebankWordDetokenizer().tokenize(word))


# When we want to apply tokenization in text data like tweets, the tokenizers mentioned above can’t produce practical tokens.
# Through this issue, NLTK has a rule based tokenizer special for tweets. We can split emojis into different words if we need
# them for tasks like sentiment analysis.
print("===========TweetTokenizer==========")
print(TweetTokenizer().tokenize(word))



# NLTK’s multi-word expression tokenizer (MWETokenizer) provides a function add_mwe() that allows
# the user to enter multiple word expressions before using the tokenizer on the text.
# More simply, it can merge multi-word expressions into single tokens.

print("=========== MWETokenizer ==========")
m = MWETokenizer()
m.add_mwe(("don't", "wanna"))
print(m.tokenize(word_tokenize(word)))   # takes input as list of words not sentence.

['i', 'do', "n't", 'wan', 'na', 'go', 'out', 'today', '.', 'it', "'s", 'not', 'good', 'MR.amal', '!']
['i', 'don', "'", 't', 'wanna', 'go', 'out', 'today', '.', 'it', "'", 's', 'not', 'good', 'MR', '.', 'amal', '!']
i   d o n' t   w a n n a   g o   o u t   t o d a y .   i t' s   n o t   g o o d   M R . a m a l!
['i', "don't", 'wanna', 'go', 'out', 'today', '.', "it's", 'not', 'good', 'MR.amal', '!']
['i', 'do', "n't", 'wan', 'na', 'go', 'out', 'today', '.', 'it', "'s", 'not', 'good', 'MR.amal', '!']
