# Tokenization using NLTK library

In [32]:
# Sample paragraph that the tokenizer cells in this notebook operate on.
# It is written as one literal per line so that the trailing space after
# "effectively." and the newline ending each line are visible in the source.
corpus = (
    "Welcome to Manish Pandey's world of Python programming. Here, you will find various resources and tutorials to help you learn Python effectively. \n"
    "Whether you are a beginner or an experienced programmer, there is something for everyone.\n"
    "Feel free to explore the content and enhance your coding skills. Happy coding!\n"
)

In [41]:
## Paragraph ---> Sentences
# sent_tokenize splits the multi-sentence `corpus` string into a list holding
# one string per sentence.
from nltk.tokenize import sent_tokenize
sentences=sent_tokenize(corpus)
# print() writes the list on a single line; the bare expression on the last line
# is echoed by the notebook as the cell's result, one element per line.
print(sentences)
sentences


["Welcome to Manish Pandey's world of Python programming.", 'Here, you will find various resources and tutorials to help you learn Python effectively.', 'Whether you are a beginner or an experienced programmer, there is something for everyone.', 'Feel free to explore the content and enhance your coding skills.', 'Happy coding!']


["Welcome to Manish Pandey's world of Python programming.",
 'Here, you will find various resources and tutorials to help you learn Python effectively.',
 'Whether you are a beginner or an experienced programmer, there is something for everyone.',
 'Feel free to explore the content and enhance your coding skills.',
 'Happy coding!']

In [None]:
# Paragraph ---> Words
# word_tokenize is applied to the whole `corpus` here; the sentence-by-sentence
# variant (sentences ---> words) is shown in the following cell.
# Every word and every punctuation mark is returned as its own token; the
# possessive "'s" is kept together as a single token (see 'Pandey', "'s").

from nltk.tokenize import word_tokenize
Words=word_tokenize(corpus)
# Printed form (single line), then the notebook's own echo of the same list.
print(Words)
Words

['Welcome', 'to', 'Manish', 'Pandey', "'s", 'world', 'of', 'Python', 'programming', '.', 'Here', ',', 'you', 'will', 'find', 'various', 'resources', 'and', 'tutorials', 'to', 'help', 'you', 'learn', 'Python', 'effectively', '.', 'Whether', 'you', 'are', 'a', 'beginner', 'or', 'an', 'experienced', 'programmer', ',', 'there', 'is', 'something', 'for', 'everyone', '.', 'Feel', 'free', 'to', 'explore', 'the', 'content', 'and', 'enhance', 'your', 'coding', 'skills', '.', 'Happy', 'coding', '!']


['Welcome',
 'to',
 'Manish',
 'Pandey',
 "'s",
 'world',
 'of',
 'Python',
 'programming',
 '.',
 'Here',
 ',',
 'you',
 'will',
 'find',
 'various',
 'resources',
 'and',
 'tutorials',
 'to',
 'help',
 'you',
 'learn',
 'Python',
 'effectively',
 '.',
 'Whether',
 'you',
 'are',
 'a',
 'beginner',
 'or',
 'an',
 'experienced',
 'programmer',
 ',',
 'there',
 'is',
 'something',
 'for',
 'everyone',
 '.',
 'Feel',
 'free',
 'to',
 'explore',
 'the',
 'content',
 'and',
 'enhance',
 'your',
 'coding',
 'skills',
 '.',
 'Happy',
 'coding',
 '!']

In [36]:
# Sentences ---> Words: tokenize one sentence at a time and print one token
# list per sentence.
from nltk.tokenize import word_tokenize

for sentence in sentences:
    print(word_tokenize(sentence))

['Welcome', 'to', 'Manish', 'Pandey', "'s", 'world', 'of', 'Python', 'programming', '.']
['Here', ',', 'you', 'will', 'find', 'various', 'resources', 'and', 'tutorials', 'to', 'help', 'you', 'learn', 'Python', 'effectively', '.']
['Whether', 'you', 'are', 'a', 'beginner', 'or', 'an', 'experienced', 'programmer', ',', 'there', 'is', 'something', 'for', 'everyone', '.']
['Feel', 'free', 'to', 'explore', 'the', 'content', 'and', 'enhance', 'your', 'coding', 'skills', '.']
['Happy', 'coding', '!']


In [None]:
# wordpunct_tokenize separates punctuation from the surrounding letters, so the
# apostrophe becomes a token of its own: "Pandey's" -> 'Pandey', "'", 's'
# (word_tokenize keeps the possessive together as "'s").

from nltk.tokenize import wordpunct_tokenize

for sentence in sentences:
    print(wordpunct_tokenize(sentence))

['Welcome', 'to', 'Manish', 'Pandey', "'", 's', 'world', 'of', 'Python', 'programming', '.']
['Here', ',', 'you', 'will', 'find', 'various', 'resources', 'and', 'tutorials', 'to', 'help', 'you', 'learn', 'Python', 'effectively', '.']
['Whether', 'you', 'are', 'a', 'beginner', 'or', 'an', 'experienced', 'programmer', ',', 'there', 'is', 'something', 'for', 'everyone', '.']
['Feel', 'free', 'to', 'explore', 'the', 'content', 'and', 'enhance', 'your', 'coding', 'skills', '.']
['Happy', 'coding', '!']


In [None]:
# TreebankWordTokenizer is given the whole multi-sentence `corpus` here, without
# splitting it into sentences first. The full stops that end the inner sentences
# stay attached to the preceding word (check 'programming.', 'effectively.'),
# while the punctuation at the very end of the text ('!') is a separate token.
# Commas are still split off wherever they occur.
from nltk.tokenize import TreebankWordTokenizer
tokenizer=TreebankWordTokenizer()
tokenizer.tokenize(corpus)

['Welcome',
 'to',
 'Manish',
 'Pandey',
 "'s",
 'world',
 'of',
 'Python',
 'programming.',
 'Here',
 ',',
 'you',
 'will',
 'find',
 'various',
 'resources',
 'and',
 'tutorials',
 'to',
 'help',
 'you',
 'learn',
 'Python',
 'effectively.',
 'Whether',
 'you',
 'are',
 'a',
 'beginner',
 'or',
 'an',
 'experienced',
 'programmer',
 ',',
 'there',
 'is',
 'something',
 'for',
 'everyone.',
 'Feel',
 'free',
 'to',
 'explore',
 'the',
 'content',
 'and',
 'enhance',
 'your',
 'coding',
 'skills.',
 'Happy',
 'coding',
 '!']

# Understanding NLTK Tokenizer Classes

There are three main ways to use tokenizers in NLTK:

1. Function-based tokenizers (like `word_tokenize`):
   - Direct functions you can call
   - Use a tokenizer that NLTK sets up internally, so there is no instance for you to create or manage
   - Example: `word_tokenize(text)`

2. Class-based tokenizers (like `TreebankWordTokenizer`):
   - Need to create an instance first
   - Reuse the same instance multiple times
   - More efficient for multiple tokenizations
   - Example: `tokenizer = TreebankWordTokenizer(); tokenizer.tokenize(text)`

3. Regular expression based tokenizers:
   - Customizable with your own patterns
   - Need to be instantiated like class-based tokenizers
   - Example: `tokenizer = RegexpTokenizer(r"\w+"); tokenizer.tokenize(text)`

In [None]:
# Each sentence is tokenized on its own here, so its full stop sits at the very
# end of the tokenizer's input and comes out as a separate token.
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
for sentence in sentences:
    print(tokenizer.tokenize(sentence))

['Welcome', 'to', 'Manish', 'Pandey', "'s", 'world', 'of', 'Python', 'programming', '.']
['Here', ',', 'you', 'will', 'find', 'various', 'resources', 'and', 'tutorials', 'to', 'help', 'you', 'learn', 'Python', 'effectively', '.']
['Whether', 'you', 'are', 'a', 'beginner', 'or', 'an', 'experienced', 'programmer', ',', 'there', 'is', 'something', 'for', 'everyone', '.']
['Feel', 'free', 'to', 'explore', 'the', 'content', 'and', 'enhance', 'your', 'coding', 'skills', '.']
['Happy', 'coding', '!']
