In [1]:
import nltk

In [2]:
text = 'NLP is a branch of data science that consists of systematic processes for analyzing, understanding, and deriving information from the text data in a smart and efficient manner. By utilizing NLP and its components, one can organize the massive chunks of text data, perform numerous automated tasks and solve a wide range of problems such as – automatic summarization, machine translation, named entity recognition, relationship extraction, sentiment analysis, speech recognition, and topic segmentation etc.'

** Tokenize **

In [3]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [4]:
sentences = sent_tokenize(text)
for sentence in sentences:
    print(sentence)

NLP is a branch of data science that consists of systematic processes for analyzing, understanding, and deriving information from the text data in a smart and efficient manner.
By utilizing NLP and its components, one can organize the massive chunks of text data, perform numerous automated tasks and solve a wide range of problems such as – automatic summarization, machine translation, named entity recognition, relationship extraction, sentiment analysis, speech recognition, and topic segmentation etc.


In [5]:
word_bag = [word_tokenize(sent) for sent in sentences]
print(word_bag)

[['NLP', 'is', 'a', 'branch', 'of', 'data', 'science', 'that', 'consists', 'of', 'systematic', 'processes', 'for', 'analyzing', ',', 'understanding', ',', 'and', 'deriving', 'information', 'from', 'the', 'text', 'data', 'in', 'a', 'smart', 'and', 'efficient', 'manner', '.'], ['By', 'utilizing', 'NLP', 'and', 'its', 'components', ',', 'one', 'can', 'organize', 'the', 'massive', 'chunks', 'of', 'text', 'data', ',', 'perform', 'numerous', 'automated', 'tasks', 'and', 'solve', 'a', 'wide', 'range', 'of', 'problems', 'such', 'as', '–', 'automatic', 'summarization', ',', 'machine', 'translation', ',', 'named', 'entity', 'recognition', ',', 'relationship', 'extraction', ',', 'sentiment', 'analysis', ',', 'speech', 'recognition', ',', 'and', 'topic', 'segmentation', 'etc', '.']]


** Removing Stopwords **

In [6]:
from nltk.corpus import stopwords
from string import punctuation

customStopwords = set(stopwords.words('english') + list(punctuation))

In [7]:
word_WO_StopWords = [word for word in word_tokenize(text) if word not in customStopwords]

print(word_WO_StopWords)

['NLP', 'branch', 'data', 'science', 'consists', 'systematic', 'processes', 'analyzing', 'understanding', 'deriving', 'information', 'text', 'data', 'smart', 'efficient', 'manner', 'By', 'utilizing', 'NLP', 'components', 'one', 'organize', 'massive', 'chunks', 'text', 'data', 'perform', 'numerous', 'automated', 'tasks', 'solve', 'wide', 'range', 'problems', '–', 'automatic', 'summarization', 'machine', 'translation', 'named', 'entity', 'recognition', 'relationship', 'extraction', 'sentiment', 'analysis', 'speech', 'recognition', 'topic', 'segmentation', 'etc']


** Identify the BigGrams **

In [8]:
from nltk.collocations import *

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(word_WO_StopWords)

bigrams = list(finder.ngram_fd.items())
print(bigrams)

[(('NLP', 'branch'), 1), (('branch', 'data'), 1), (('data', 'science'), 1), (('science', 'consists'), 1), (('consists', 'systematic'), 1), (('systematic', 'processes'), 1), (('processes', 'analyzing'), 1), (('analyzing', 'understanding'), 1), (('understanding', 'deriving'), 1), (('deriving', 'information'), 1), (('information', 'text'), 1), (('text', 'data'), 2), (('data', 'smart'), 1), (('smart', 'efficient'), 1), (('efficient', 'manner'), 1), (('manner', 'By'), 1), (('By', 'utilizing'), 1), (('utilizing', 'NLP'), 1), (('NLP', 'components'), 1), (('components', 'one'), 1), (('one', 'organize'), 1), (('organize', 'massive'), 1), (('massive', 'chunks'), 1), (('chunks', 'text'), 1), (('data', 'perform'), 1), (('perform', 'numerous'), 1), (('numerous', 'automated'), 1), (('automated', 'tasks'), 1), (('tasks', 'solve'), 1), (('solve', 'wide'), 1), (('wide', 'range'), 1), (('range', 'problems'), 1), (('problems', '–'), 1), (('–', 'automatic'), 1), (('automatic', 'summarization'), 1), (('sum

In [9]:
# Taking them in sorted order
bigrams.sort(key=lambda item: item[-1], reverse=True)

In [10]:
for k, v in bigrams:
    print(k, v)

('text', 'data') 2
('NLP', 'branch') 1
('branch', 'data') 1
('data', 'science') 1
('science', 'consists') 1
('consists', 'systematic') 1
('systematic', 'processes') 1
('processes', 'analyzing') 1
('analyzing', 'understanding') 1
('understanding', 'deriving') 1
('deriving', 'information') 1
('information', 'text') 1
('data', 'smart') 1
('smart', 'efficient') 1
('efficient', 'manner') 1
('manner', 'By') 1
('By', 'utilizing') 1
('utilizing', 'NLP') 1
('NLP', 'components') 1
('components', 'one') 1
('one', 'organize') 1
('organize', 'massive') 1
('massive', 'chunks') 1
('chunks', 'text') 1
('data', 'perform') 1
('perform', 'numerous') 1
('numerous', 'automated') 1
('automated', 'tasks') 1
('tasks', 'solve') 1
('solve', 'wide') 1
('wide', 'range') 1
('range', 'problems') 1
('problems', '–') 1
('–', 'automatic') 1
('automatic', 'summarization') 1
('summarization', 'machine') 1
('machine', 'translation') 1
('translation', 'named') 1
('named', 'entity') 1
('entity', 'recognition') 1
('recognit

** Stemming ** <br/>
** Parts of Speech Tagging **

In [11]:
text2 = "Mary closed on closing night when she was in a mood to close"

In [12]:
from nltk.stem.lancaster import LancasterStemmer

st = LancasterStemmer()

In [13]:
stemmedWords = [st.stem(word) for word in word_tokenize(text2)]

print(stemmedWords)

['mary', 'clos', 'on', 'clos', 'night', 'when', 'she', 'was', 'in', 'a', 'mood', 'to', 'clos']


In [14]:
nltk.pos_tag(word_tokenize(text2))

[('Mary', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('a', 'DT'),
 ('mood', 'NN'),
 ('to', 'TO'),
 ('close', 'VB')]

** Word Sense Disambiguation **

In [15]:
from nltk.corpus import wordnet as wn

for ss in wn.synsets('bass'):
    print(ss, ss.definition())

Synset('bass.n.01') the lowest part of the musical range
Synset('bass.n.02') the lowest part in polyphonic music
Synset('bass.n.03') an adult male singer with the lowest voice
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') the lowest adult male singing voice
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') having or denoting a low vocal or instrumental range


In [16]:
#  lesk is an algorithm for word sense disambiguation

from nltk.wsd import lesk

In [17]:
sense1 = lesk(word_tokenize('sing in a lower tone, along with the bass'), 'bass')
print(sense1, sense1.definition())

Synset('bass.n.07') the member with the lowest range of a family of musical instruments


In [18]:
sense2 = lesk(word_tokenize('This sea bass was really hard to catch'), 'bass')
print(sense2, sense2.definition())

Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae


In [19]:
sense3 = lesk(word_tokenize('The bass of this sea were hard to catch'), 'bass')
print(sense3, sense3.definition())

Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
