##### Library Installation

In [3]:
pip install nltk spacy textblob -U

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
import nltk

In [6]:
nltk.download('punkt') # tokenization
nltk.download('stopwords') # stopwords removal
nltk.download('averaged_perceptron_tagger') # POS tagging
nltk.download('wordnet') # wordnet database and lemmatization
nltk.download('omw-1.4') # stemming
nltk.download('indian') # Indian language POS tagging
nltk.download('maxent_ne_chunker') # chunking

[nltk_data] Downloading package punkt to /home/mitu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/mitu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mitu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/mitu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/mitu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package indian to /home/mitu/nltk_data...
[nltk_data]   Package indian is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/mitu/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

##### Sample Example

In [8]:
sent = 'They told that their ages are 25, 27 and 31 respectively.'

In [9]:
# Find the average of ages mentioned in the above sentence.

In [11]:
ages = []
for word in sent.split():
    if word.isdigit():
        ages.append(int(word))

In [13]:
sum(ages) / len(ages)

27.666666666666668

In [15]:
ages = [int(word) for word in sent.split() if word.isdigit()]
sum(ages) / len(ages)

27.666666666666668

In [16]:
import numpy as np

In [17]:
np.mean([int(word) for word in sent.split() if word.isdigit()])

27.666666666666668

#### Tokenization

In [19]:
sent = 'Hello friends! How are you? Welcome to Python Programming.'

In [20]:
# import the functions
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [21]:
# segmentation
sent_tokenize(sent)

['Hello friends!', 'How are you?', 'Welcome to Python Programming.']

In [22]:
word_tokenize(sent)

['Hello',
 'friends',
 '!',
 'How',
 'are',
 'you',
 '?',
 'Welcome',
 'to',
 'Python',
 'Programming',
 '.']

In [23]:
# Find percentage of punctuation symbols present in it.

In [31]:
punct_count = len([word for word in word_tokenize(sent) if not word.isalnum()])

In [32]:
punct_count / len(word_tokenize(sent))

0.25

In [33]:
ord('y')

121

In [34]:
ord('#')

35

In [37]:
import sys
sys.getsizeof('=')

50

In [38]:
help(sys.getsizeof)

Help on built-in function getsizeof in module sys:

getsizeof(...)
    getsizeof(object [, default]) -> int
    
    Return the size of object in bytes.



In [41]:
char = 'abcdefghih'
sys.getsizeof(char)

59

In [46]:
chr(2067)

'ࠓ'

In [49]:
char = '\u0935'

In [50]:
print(char)

व


In [51]:
char = '\u0935\u0940'
char

'वी'

In [52]:
ord('व')

2357

In [53]:
chr(2358)

'श'

In [54]:
chr(0x935)

'व'

In [55]:
sys.getsizeof('व')

76

In [56]:
name = 'तुषार कुटे'

In [57]:
name.split()

['तुषार', 'कुटे']

In [58]:
name.startswith('त')

True

In [59]:
name.replace('तु','तू')

'तूषार कुटे'

In [60]:
name.find('ष')

2

In [61]:
len(name)

10

In [62]:
names = ['प्रणव', 'अमेय', 'प्रतीक्षा', 'मिताली', 'ऋतुजा', 'रूथ', 'श्वेता']

In [63]:
for name in names:
    if name.startswith('प्र'):
        print(name)

प्रणव
प्रतीक्षा


In [64]:
mtext = '४८ कि. मी. अंतरावर आणि पुणे जिल्ह्यातील वेल्हे तालुक्यात व भोर गावाच्या वायव्येला २४ कि.मी. अंतरावर नीरा-वेळवंडी-कानंदी आणि गुंजवणी या नद्यांच्या खोऱ्यांच्या बेचक्यात मुरुंबदेवाचा डोंगर उभा आहे. मावळ भागामध्ये राज्यविस्तार साध्य करण्यासाठी राजगड आणि तोरणा हे दोन्ही किल्ले मोक्याच्या ठिकाणी होते. तोरणा Archived 2020-09-20 at the Wayback Machine. किल्ल्याचा बालेकिल्ला आकाराने लहान असल्यामुळे राजकीय केंद्र म्हणून हा किल्ला सोयीचा नव्हता. त्यामानाने राजगड दुर्गम असून त्याचा बालेकिल्ला बराच मोठा आहे.'

In [65]:
mtext

'४८ कि. मी. अंतरावर आणि पुणे जिल्ह्यातील वेल्हे तालुक्यात व भोर गावाच्या वायव्येला २४ कि.मी. अंतरावर नीरा-वेळवंडी-कानंदी आणि गुंजवणी या नद्यांच्या खोऱ्यांच्या बेचक्यात मुरुंबदेवाचा डोंगर उभा आहे. मावळ भागामध्ये राज्यविस्तार साध्य करण्यासाठी राजगड आणि तोरणा हे दोन्ही किल्ले मोक्याच्या ठिकाणी होते. तोरणा Archived 2020-09-20 at the Wayback Machine. किल्ल्याचा बालेकिल्ला आकाराने लहान असल्यामुळे राजकीय केंद्र म्हणून हा किल्ला सोयीचा नव्हता. त्यामानाने राजगड दुर्गम असून त्याचा बालेकिल्ला बराच मोठा आहे.'

In [66]:
word_tokenize(mtext)

['४८',
 'कि',
 '.',
 'मी',
 '.',
 'अंतरावर',
 'आणि',
 'पुणे',
 'जिल्ह्यातील',
 'वेल्हे',
 'तालुक्यात',
 'व',
 'भोर',
 'गावाच्या',
 'वायव्येला',
 '२४',
 'कि.मी',
 '.',
 'अंतरावर',
 'नीरा-वेळवंडी-कानंदी',
 'आणि',
 'गुंजवणी',
 'या',
 'नद्यांच्या',
 'खोऱ्यांच्या',
 'बेचक्यात',
 'मुरुंबदेवाचा',
 'डोंगर',
 'उभा',
 'आहे',
 '.',
 'मावळ',
 'भागामध्ये',
 'राज्यविस्तार',
 'साध्य',
 'करण्यासाठी',
 'राजगड',
 'आणि',
 'तोरणा',
 'हे',
 'दोन्ही',
 'किल्ले',
 'मोक्याच्या',
 'ठिकाणी',
 'होते',
 '.',
 'तोरणा',
 'Archived',
 '2020-09-20',
 'at',
 'the',
 'Wayback',
 'Machine',
 '.',
 'किल्ल्याचा',
 'बालेकिल्ला',
 'आकाराने',
 'लहान',
 'असल्यामुळे',
 'राजकीय',
 'केंद्र',
 'म्हणून',
 'हा',
 'किल्ला',
 'सोयीचा',
 'नव्हता',
 '.',
 'त्यामानाने',
 'राजगड',
 'दुर्गम',
 'असून',
 'त्याचा',
 'बालेकिल्ला',
 'बराच',
 'मोठा',
 'आहे',
 '.']

##### Space Tokenizer

In [72]:
f = open('mydata.txt')
data = f.read()

In [73]:
print(data)

Hello Friends! How are you?
Welcome to the world of Python Programming.


In [76]:
# import the class
from nltk.tokenize import SpaceTokenizer

# create the object
tk = SpaceTokenizer()

# tokenize the data
tk.tokenize(data)

['Hello',
 'Friends!',
 'How',
 'are',
 'you?\nWelcome',
 'to',
 'the',
 'world',
 'of',
 'Python',
 'Programming.']

##### Tab Tokenizer

In [78]:
f = open('mydata.txt')
data = f.read()
print(data)

Hello Friends!	How are you?
Welcome to the world of	Python Programming.



In [79]:
# import the class
from nltk.tokenize import TabTokenizer

# create the object
tk = TabTokenizer()

# tokenize the data
tk.tokenize(data)

['Hello Friends!',
 'How are you?\nWelcome to the world of',
 'Python Programming.\n']

##### Line Tokenizer

In [81]:
# import the class
from nltk.tokenize import LineTokenizer

# create the object
tk = LineTokenizer()

# tokenize the data
tk.tokenize(data)

['Hello Friends!\tHow are you?',
 'Welcome to the world of\tPython Programming.']

##### Whitespace Tokenizer

In [83]:
# import the class
from nltk.tokenize import WhitespaceTokenizer

# create the object
tk = WhitespaceTokenizer()

# tokenize the data
tk.tokenize(data)

['Hello',
 'Friends!',
 'How',
 'are',
 'you?',
 'Welcome',
 'to',
 'the',
 'world',
 'of',
 'Python',
 'Programming.']

##### MWE Tokenizer

In [85]:
sent1 = '''The Van Rossum is Python creator, visting Pune this week. The 
development community is very eager to meet Van Rossum.'''

In [86]:
print(sent1)

The Van Rossum is Python creator, visting Pune this week. The 
development community is very eager to meet Van Rossum.


In [87]:
word_tokenize(sent1)

['The',
 'Van',
 'Rossum',
 'is',
 'Python',
 'creator',
 ',',
 'visting',
 'Pune',
 'this',
 'week',
 '.',
 'The',
 'development',
 'community',
 'is',
 'very',
 'eager',
 'to',
 'meet',
 'Van',
 'Rossum',
 '.']

In [90]:
# import the class
from nltk.tokenize import MWETokenizer

# create the object
tk = MWETokenizer(separator=' ')

# add Multi Word Expression
tk.add_mwe(('Van','Rossum'))

# tokenize the data
tk.tokenize(word_tokenize(sent1))

['The',
 'Van Rossum',
 'is',
 'Python',
 'creator',
 ',',
 'visting',
 'Pune',
 'this',
 'week',
 '.',
 'The',
 'development',
 'community',
 'is',
 'very',
 'eager',
 'to',
 'meet',
 'Van Rossum',
 '.']

##### Tweet Tokenizer

In [92]:
sent = 'Hello Friends :)! How are you? Welcome to the world of Python Programming. :D'

In [93]:
print(sent)

Hello Friends :)! How are you? Welcome to the world of Python Programming. :D


In [94]:
# import the class
from nltk.tokenize import TweetTokenizer

# create the object
tk = TweetTokenizer()

# tokenize the data
tk.tokenize(sent)

['Hello',
 'Friends',
 ':)',
 '!',
 'How',
 'are',
 'you',
 '?',
 'Welcome',
 'to',
 'the',
 'world',
 'of',
 'Python',
 'Programming',
 '.',
 ':D']

In [96]:
f = open('mydata.txt', encoding='utf-8')
data = f.read()
print(data)

Hello Friends!😀 How are you?👋
Welcome🙏🏼 to the world🌎 of Python 💻Programming.



In [98]:
word_tokenize(data)

['Hello',
 'Friends',
 '!',
 '😀',
 'How',
 'are',
 'you',
 '?',
 '👋',
 'Welcome🙏🏼',
 'to',
 'the',
 'world🌎',
 'of',
 'Python',
 '💻Programming',
 '.']

In [97]:
tk.tokenize(data)

['Hello',
 'Friends',
 '!',
 '😀',
 'How',
 'are',
 'you',
 '?',
 '👋',
 'Welcome',
 '🙏🏼',
 'to',
 'the',
 'world',
 '🌎',
 'of',
 'Python',
 '💻',
 'Programming',
 '.']

##### Custom Tokenizer

In [100]:
import re

def custom_tokenizer(text):
    return re.split(r"[.,;?!\s]+", text)

text = "This is some text with punctuation > Let's tokenize it. Is it ok?"

tokens = custom_tokenizer(text)

print("Tokens:")
for token in tokens:
    print(token)

Tokens:
This
is
some
text
with
punctuation
>
Let's
tokenize
it
Is
it
ok



In [101]:
# https://mitu.co.in/dataset
# Download: student3.tsv

In [102]:
f = open('student3.tsv')
data = f.read()

In [104]:
print(data)

roll	name	class	marks	age
1	anil	TE	56.77	22
2	amit	TE	59.77	21
3	aniket	BE	76.88	19
4	ajinkya	TE	69.66	20
5	asha	TE	63.28	20
6	ayesha	BE	49.55	20
7	amar	BE	65.34	19
8	amita	BE	68.33	23
9	amol	TE	56.75	20
10	anmol	BE	78.66	21



In [123]:
newdata = []
for x in data.split('\n'):
    inner_list = []
    for y in x.split('\t'):
        if y.isdigit():
            inner_list.append(int(y))
        elif y.find('.') > 0:
            inner_list.append(float(y))
        else:
            inner_list.append(y)
    newdata.append(inner_list)
newdata

[['roll', 'name', 'class', 'marks', 'age'],
 [1, 'anil', 'TE', 56.77, 22],
 [2, 'amit', 'TE', 59.77, 21],
 [3, 'aniket', 'BE', 76.88, 19],
 [4, 'ajinkya', 'TE', 69.66, 20],
 [5, 'asha', 'TE', 63.28, 20],
 [6, 'ayesha', 'BE', 49.55, 20],
 [7, 'amar', 'BE', 65.34, 19],
 [8, 'amita', 'BE', 68.33, 23],
 [9, 'amol', 'TE', 56.75, 20],
 [10, 'anmol', 'BE', 78.66, 21],
 ['']]