# Tokenization

A document consists of many sentences. Splitting it into individual words is called **word tokenization**, and splitting it into individual sentences is called **sentence tokenization**.

We can perform tokenization in several ways:
- Python's built-in string methods
- Regular expressions (RegEx)
- The NLTK library
- spaCy
- Keras (a library for building neural networks)


## Using in-built methods

In [2]:
# Sample paragraph used as the tokenization input.
text = """As dawn broke over the sleepy village, a gentle breeze stirred the leaves, whispering secrets to the awakening world. The scent of dew-kissed grass hung in the air, mingling with the aroma of freshly brewed coffee wafting from the local café. Birds chirped joyfully, heralding the start of a new day, while the sun peeked over the horizon, casting a golden glow upon the cobblestone streets. In the distance, the silhouette of a lone figure could be seen, ambling along the winding path with purposeful strides. Each step seemed to echo the rhythm of the village, a harmonious melody of life unfolding with every passing moment. As the day unfolded, the village embraced the promise of adventure and possibility, basking in the beauty of the present moment."""

# Word tokens: split on any run of whitespace. Punctuation stays attached to
# the neighbouring word (e.g. 'village,').
word = text.split()

# Sentence tokens: split on full stops. A bare text.split(".") leaves a
# leading space on every sentence after the first and produces an empty
# string after the final full stop, so trim each piece and drop the empties.
sent = [piece.strip() for piece in text.split(".") if piece.strip()]

print('Word Token: ', word)
print('\n')
print('Sentence Token: ', sent)



Word Token:  ['As', 'dawn', 'broke', 'over', 'the', 'sleepy', 'village,', 'a', 'gentle', 'breeze', 'stirred', 'the', 'leaves,', 'whispering', 'secrets', 'to', 'the', 'awakening', 'world.', 'The', 'scent', 'of', 'dew-kissed', 'grass', 'hung', 'in', 'the', 'air,', 'mingling', 'with', 'the', 'aroma', 'of', 'freshly', 'brewed', 'coffee', 'wafting', 'from', 'the', 'local', 'café.', 'Birds', 'chirped', 'joyfully,', 'heralding', 'the', 'start', 'of', 'a', 'new', 'day,', 'while', 'the', 'sun', 'peeked', 'over', 'the', 'horizon,', 'casting', 'a', 'golden', 'glow', 'upon', 'the', 'cobblestone', 'streets.', 'In', 'the', 'distance,', 'the', 'silhouette', 'of', 'a', 'lone', 'figure', 'could', 'be', 'seen,', 'ambling', 'along', 'the', 'winding', 'path', 'with', 'purposeful', 'strides.', 'Each', 'step', 'seemed', 'to', 'echo', 'the', 'rhythm', 'of', 'the', 'village,', 'a', 'harmonious', 'melody', 'of', 'life', 'unfolding', 'with', 'every', 'passing', 'moment.', 'As', 'the', 'day', 'unfolded,', 'the',

## Using RegEx

In [3]:
import re

# Word tokens: every maximal run of word characters. The pattern is a raw
# string so that \w reaches the regex engine untouched; inside a plain string
# literal '\w' is an invalid escape sequence and triggers a warning on
# current Python versions.
word = re.findall(r'\w+', text)

# Sentence tokens: split on a literal full stop, then trim the whitespace
# around each piece and drop the empty piece that follows the final full stop.
sent = [piece.strip() for piece in re.split(r'[.]', text) if piece.strip()]

print("word Token: ", word)
print('\n')
print('Sentence Token: ', sent)

word Token:  ['As', 'dawn', 'broke', 'over', 'the', 'sleepy', 'village', 'a', 'gentle', 'breeze', 'stirred', 'the', 'leaves', 'whispering', 'secrets', 'to', 'the', 'awakening', 'world', 'The', 'scent', 'of', 'dew', 'kissed', 'grass', 'hung', 'in', 'the', 'air', 'mingling', 'with', 'the', 'aroma', 'of', 'freshly', 'brewed', 'coffee', 'wafting', 'from', 'the', 'local', 'café', 'Birds', 'chirped', 'joyfully', 'heralding', 'the', 'start', 'of', 'a', 'new', 'day', 'while', 'the', 'sun', 'peeked', 'over', 'the', 'horizon', 'casting', 'a', 'golden', 'glow', 'upon', 'the', 'cobblestone', 'streets', 'In', 'the', 'distance', 'the', 'silhouette', 'of', 'a', 'lone', 'figure', 'could', 'be', 'seen', 'ambling', 'along', 'the', 'winding', 'path', 'with', 'purposeful', 'strides', 'Each', 'step', 'seemed', 'to', 'echo', 'the', 'rhythm', 'of', 'the', 'village', 'a', 'harmonious', 'melody', 'of', 'life', 'unfolding', 'with', 'every', 'passing', 'moment', 'As', 'the', 'day', 'unfolded', 'the', 'village', 

## Using NLTK

In [1]:
import nltk

# Fetch the Punkt tokenizer data that NLTK's word_tokenize / sent_tokenize
# rely on. Recent NLTK releases (3.9 and later) load the 'punkt_tab' package
# instead of the original pickled 'punkt' models, so request both to keep the
# notebook working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anant\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
from nltk.tokenize import word_tokenize, sent_tokenize

# Run NLTK's word-level and sentence-level tokenizers over the sample text.
# As the captured output shows, punctuation marks come back as separate
# word tokens.
word, sent = word_tokenize(text), sent_tokenize(text)

print('Word Token: ', word)
print('\n')
print('Sentence Token: ', sent)

Word Token:  ['As', 'dawn', 'broke', 'over', 'the', 'sleepy', 'village', ',', 'a', 'gentle', 'breeze', 'stirred', 'the', 'leaves', ',', 'whispering', 'secrets', 'to', 'the', 'awakening', 'world', '.', 'The', 'scent', 'of', 'dew-kissed', 'grass', 'hung', 'in', 'the', 'air', ',', 'mingling', 'with', 'the', 'aroma', 'of', 'freshly', 'brewed', 'coffee', 'wafting', 'from', 'the', 'local', 'café', '.', 'Birds', 'chirped', 'joyfully', ',', 'heralding', 'the', 'start', 'of', 'a', 'new', 'day', ',', 'while', 'the', 'sun', 'peeked', 'over', 'the', 'horizon', ',', 'casting', 'a', 'golden', 'glow', 'upon', 'the', 'cobblestone', 'streets', '.', 'In', 'the', 'distance', ',', 'the', 'silhouette', 'of', 'a', 'lone', 'figure', 'could', 'be', 'seen', ',', 'ambling', 'along', 'the', 'winding', 'path', 'with', 'purposeful', 'strides', '.', 'Each', 'step', 'seemed', 'to', 'echo', 'the', 'rhythm', 'of', 'the', 'village', ',', 'a', 'harmonious', 'melody', 'of', 'life', 'unfolding', 'with', 'every', 'passing'

## Using Spacy

In [5]:
import spacy

# Load the 'en_core_web_sm' pipeline and process the sample text with it.
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)

# Word tokens: the text of every token in the processed document.
word = [token.text for token in doc]

# Sentence tokens: the text of every sentence yielded by doc.sents.
sent = [span.text for span in doc.sents]

print('Word Token: ', word)
print('\n')
print('Sentence Token: ', sent)

Word Token:  ['As', 'dawn', 'broke', 'over', 'the', 'sleepy', 'village', ',', 'a', 'gentle', 'breeze', 'stirred', 'the', 'leaves', ',', 'whispering', 'secrets', 'to', 'the', 'awakening', 'world', '.', 'The', 'scent', 'of', 'dew', '-', 'kissed', 'grass', 'hung', 'in', 'the', 'air', ',', 'mingling', 'with', 'the', 'aroma', 'of', 'freshly', 'brewed', 'coffee', 'wafting', 'from', 'the', 'local', 'café', '.', 'Birds', 'chirped', 'joyfully', ',', 'heralding', 'the', 'start', 'of', 'a', 'new', 'day', ',', 'while', 'the', 'sun', 'peeked', 'over', 'the', 'horizon', ',', 'casting', 'a', 'golden', 'glow', 'upon', 'the', 'cobblestone', 'streets', '.', 'In', 'the', 'distance', ',', 'the', 'silhouette', 'of', 'a', 'lone', 'figure', 'could', 'be', 'seen', ',', 'ambling', 'along', 'the', 'winding', 'path', 'with', 'purposeful', 'strides', '.', 'Each', 'step', 'seemed', 'to', 'echo', 'the', 'rhythm', 'of', 'the', 'village', ',', 'a', 'harmonious', 'melody', 'of', 'life', 'unfolding', 'with', 'every', '

### Basics of Spacy 

In [7]:
# Print each token followed by four of its boolean attributes, one per line,
# with a blank line separating consecutive tokens.
for token in doc:
    print(
        token,
        "\nis_alpha:", token.is_alpha,
        "\nis_punct:", token.is_punct,
        "\nlike_num:", token.like_num,
        "\nis_currency:", token.is_currency,
        "\n",
    )

As 
is_alpha: True 
is_punct: False 
like_num: False 
is_currency: False 

dawn 
is_alpha: True 
is_punct: False 
like_num: False 
is_currency: False 

broke 
is_alpha: True 
is_punct: False 
like_num: False 
is_currency: False 

over 
is_alpha: True 
is_punct: False 
like_num: False 
is_currency: False 

the 
is_alpha: True 
is_punct: False 
like_num: False 
is_currency: False 

sleepy 
is_alpha: True 
is_punct: False 
like_num: False 
is_currency: False 

village 
is_alpha: True 
is_punct: False 
like_num: False 
is_currency: False 

, 
is_alpha: False 
is_punct: True 
like_num: False 
is_currency: False 

a 
is_alpha: True 
is_punct: False 
like_num: False 
is_currency: False 

gentle 
is_alpha: True 
is_punct: False 
like_num: False 
is_currency: False 

breeze 
is_alpha: True 
is_punct: False 
like_num: False 
is_currency: False 

stirred 
is_alpha: True 
is_punct: False 
like_num: False 
is_currency: False 

the 
is_alpha: True 
is_punct: False 
like_num: False 
is_currency: Fals

## Using Tensorflow/Keras

In [6]:
# NOTE(review): this helper is reported as deprecated in newer Keras /
# TensorFlow releases — confirm it is available in the installed version.
from keras.preprocessing.text import text_to_word_sequence

# Word tokens: with its default arguments the helper lower-cases the text,
# removes punctuation and splits on spaces (see the captured output).
word = text_to_word_sequence(text)

# Sentence tokens: text_to_word_sequence replaces every character listed in
# `filters` with the `split` string before splitting. With the default
# filters and split='.', commas and hyphens would be turned into '.' and cut
# sentences apart mid-clause. Pass an empty filter set so that only real full
# stops act as boundaries, then trim the whitespace around each sentence.
sent = [
    piece.strip()
    for piece in text_to_word_sequence(text, filters='', split='.')
    if piece.strip()
]
print('word token',word)
print('\n')
print('sentence token:',sent)

word token ['as', 'dawn', 'broke', 'over', 'the', 'sleepy', 'village', 'a', 'gentle', 'breeze', 'stirred', 'the', 'leaves', 'whispering', 'secrets', 'to', 'the', 'awakening', 'world', 'the', 'scent', 'of', 'dew', 'kissed', 'grass', 'hung', 'in', 'the', 'air', 'mingling', 'with', 'the', 'aroma', 'of', 'freshly', 'brewed', 'coffee', 'wafting', 'from', 'the', 'local', 'café', 'birds', 'chirped', 'joyfully', 'heralding', 'the', 'start', 'of', 'a', 'new', 'day', 'while', 'the', 'sun', 'peeked', 'over', 'the', 'horizon', 'casting', 'a', 'golden', 'glow', 'upon', 'the', 'cobblestone', 'streets', 'in', 'the', 'distance', 'the', 'silhouette', 'of', 'a', 'lone', 'figure', 'could', 'be', 'seen', 'ambling', 'along', 'the', 'winding', 'path', 'with', 'purposeful', 'strides', 'each', 'step', 'seemed', 'to', 'echo', 'the', 'rhythm', 'of', 'the', 'village', 'a', 'harmonious', 'melody', 'of', 'life', 'unfolding', 'with', 'every', 'passing', 'moment', 'as', 'the', 'day', 'unfolded', 'the', 'village', 'e